In [1]:
import pandas as pd

In [3]:
chicago = pd.read_csv("chicago.csv")
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [4]:
chicago.info() # Salary is stored as objec (=string)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null object
dtypes: object(4)
memory usage: 1002.0+ KB


In [7]:
chicago["Department"].nunique() # The number of unique departments in this list 

35

In [8]:
chicago["Department"].count() #We can optimize memory usage by convert string to category

32062

In [9]:
chicago["Department"] = chicago["Department"].astype("category")

In [10]:
chicago.info() # reduced the memory usage by 25%

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(1), object(3)
memory usage: 784.4+ KB


## Common String Method- .lower(), .upper(), .title(), and len()

In [12]:
chicago = pd.read_csv("chicago.csv")
chicago["Department"] = chicago["Department"].astype("category")
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [14]:
"HELLO World".lower()

'hello world'

In [18]:
"Hello World".upper()

'HELLO WORLD'

In [19]:
"hello world".title()

'Hello World'

In [20]:
len("Hello World") #spaces count

11

In [24]:
chicago["Name"].str.lower().head()  
#When using string method on pandas Series, put .str prefex before the string method

0        aaron,  elvia j
1      aaron,  jeffery m
2         aaron,  karina
3    aaron,  kimberlei r
4    abad jr,  vicente m
Name: Name, dtype: object

In [26]:
chicago["Name"].str.title().head() # capitalize the first word of every word

0        Aaron,  Elvia J
1      Aaron,  Jeffery M
2         Aaron,  Karina
3    Aaron,  Kimberlei R
4    Abad Jr,  Vicente M
Name: Name, dtype: object

In [30]:
chicago["Position Title"].str.title().head()

0            Water Rate Taker
1              Police Officer
2              Police Officer
3    Chief Contract Expediter
4           Civil Engineer Iv
Name: Position Title, dtype: object

In [31]:
chicago["Position Title"] = chicago["Position Title"].str.title()

In [32]:
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",Police Officer,POLICE,$84450.00
2,"AARON, KARINA",Police Officer,POLICE,$84450.00
3,"AARON, KIMBERLEI R",Chief Contract Expediter,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",Civil Engineer Iv,WATER MGMNT,$106836.00


In [36]:
chicago["Department"].str.len().head() # this len is not the python built-in method 

0    11.0
1     6.0
2     6.0
3    16.0
4    11.0
Name: Department, dtype: float64

## The .str.replace() Method

In [37]:
chicago = pd.read_csv("chicago.csv")
chicago["Department"] = chicago["Department"].astype("category")
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [38]:
"Hello World".replace("l", "!") # replace method on a regular string

'He!!o Wor!d'

In [39]:
chicago.tail() 

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32058,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00
32062,,,,


In [41]:
chicago = pd.read_csv("chicago.csv").dropna(how = "all") # Use dropna(how = "all") to drop the final row
chicago["Department"] = chicago["Department"].astype("category")
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [12]:
import pandas as pd
chicago = pd.read_csv("chicago.csv").dropna(how = "all")
chicago["Department"] = chicago["Department"].astype("category")
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [13]:
chicago["Department"] = chicago["Department"].str.replace("MGMNT", "MANAGEMENT")
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,$106836.00


#### Turn string number to float number

In [14]:
chicago["Employee Annual Salary"].str.replace("$", "").astype(float).head() #replace $ with empty string

0     90744.0
1     84450.0
2     84450.0
3     89880.0
4    106836.0
Name: Employee Annual Salary, dtype: float64

In [16]:
chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].str.replace("$", "").astype(float) 
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,106836.0


In [17]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null float64
dtypes: float64(1), object(3)
memory usage: 1.2+ MB
