In [1]:
import pandas as pd

In [3]:
chi = pd.read_csv("data/chicago.csv")
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [6]:
chi.info() # last column is string too

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null object
dtypes: object(4)
memory usage: 1002.1+ KB


In [7]:
chi['Department'].nunique() # number of unique values

35

In [9]:
chi['Department'] = chi['Department'].astype('category')
chi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(1), object(3)
memory usage: 784.4+ KB


In [10]:
# standard starting point
chi = pd.read_csv("data/chicago.csv").dropna(how = 'all')
chi['Department'] = chi['Department'].astype('category')
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


# lower()  upper()  title() and len()

Prefix with .str !!!
Will return new series

In [12]:
chi['Name'].str.lower()

0            aaron,  elvia j
1          aaron,  jeffery m
2             aaron,  karina
3        aaron,  kimberlei r
4        abad jr,  vicente m
                ...         
32058     zygowicz,  peter j
32059      zymantas,  mark e
32060    zyrkowski,  carlo e
32061    zyskowski,  dariusz
32062                    NaN
Name: Name, Length: 32063, dtype: object

In [14]:
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [16]:
chi['Name'].str.title()  

0            Aaron,  Elvia J
1          Aaron,  Jeffery M
2             Aaron,  Karina
3        Aaron,  Kimberlei R
4        Abad Jr,  Vicente M
                ...         
32058     Zygowicz,  Peter J
32059      Zymantas,  Mark E
32060    Zyrkowski,  Carlo E
32061    Zyskowski,  Dariusz
32062                    NaN
Name: Name, Length: 32063, dtype: object

In [19]:
chi['Name'].str.len()

0        15.0
1        17.0
2        14.0
3        19.0
4        19.0
         ... 
32058    18.0
32059    17.0
32060    19.0
32061    19.0
32062     NaN
Name: Name, Length: 32063, dtype: float64

# .str.replace()

In [20]:
chi = pd.read_csv("data/chicago.csv").dropna(how = 'all')
chi['Department'] = chi['Department'].astype('category')
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [21]:
chi['Department'].str.replace('MGMNT', 'MANAGEMENT')

0        WATER MANAGEMENT
1                  POLICE
2                  POLICE
3        GENERAL SERVICES
4        WATER MANAGEMENT
               ...       
32057    GENERAL SERVICES
32058              POLICE
32059              POLICE
32060              POLICE
32061                DoIT
Name: Department, Length: 32062, dtype: object

In [23]:
chi['Department'] = chi['Department'].str.replace('MGMNT', 'MANAGEMENT')
chi.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,$106836.00


In [29]:
# Convert last col to floats
chi['Employee Annual Salary'] = chi['Employee Annual Salary'].str.replace('$', '').astype('float')

In [33]:
chi['Employee Annual Salary'].describe()

count     32062.000000
mean      80204.178634
std       25098.329868
min           0.960000
25%       72862.400000
50%       84450.000000
75%       93240.000000
max      300000.000000
Name: Employee Annual Salary, dtype: float64

# Filtering with String Methods


In [35]:
chi = pd.read_csv("data/chicago.csv").dropna(how = 'all')
chi['Department'] = chi['Department'].astype('category')
chi.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [39]:
# Extract any row wherethe word ''water' is present _anywhere_ in Position Title

mask = chi['Position Title'].str.lower().str.contains('water')
mask

0         True
1        False
2        False
3        False
4        False
         ...  
32057    False
32058    False
32059    False
32060    False
32061    False
Name: Position Title, Length: 32062, dtype: bool

In [40]:
chi[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
...,...,...,...,...
29669,"VERMA, ANUPAM",MANAGING ENGINEER - WATER MANAGEMENT,WATER MGMNT,$111192.00
30239,"WASHINGTON, JOSEPH",WATER CHEMIST III,WATER MGMNT,$89676.00
30544,"WEST, THOMAS R",GEN SUPT OF WATER MANAGEMENT,WATER MGMNT,$115704.00
30991,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00


In [41]:
# Extract any row wherethe word 'water' is present _in beginiung_ of Position Title
mask = chi['Position Title'].str.lower().str.startswith('water')
chi[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
...,...,...,...,...
28574,"THREATT, DENISE R",WATER QUALITY INSPECTOR,WATER MGMNT,$62004.00
28602,"TIGNOR, DARRYL B",WATER RATE TAKER,WATER MGMNT,$78948.00
28955,"TRAVIS COOK, LESLIE R",WATER RATE TAKER,WATER MGMNT,$78948.00
29584,"VELAZQUEZ, JOHN",WATER RATE TAKER,WATER MGMNT,$78948.00


In [42]:
# Extract any row where Position Title ends with the combination 'ist'
mask = chi['Position Title'].str.lower().str.endswith('ist')
chi[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,$94328.00
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$91476.00
...,...,...,...,...
31667,"YODER, TERESA G",ARCHIVAL SPECIALIST,PUBLIC LIBRARY,$74304.00
31688,"YOUNGBLOOM, LAURENCE G",CRIMES SURVEILLANCE SPECIALIST,OEMC,$19676.80
31717,"YOUNG, KIMBERLY M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$68556.00
31837,"ZAPATA, HUGO",SR PROCUREMENT SPECIALIST,PROCUREMENT,$87324.00


# Removing spaces: .strip()  .lstrip()  .rstrip()

In [44]:
"   Hello World   ".lstrip().rstrip()

'Hello World'

In [45]:
"   Hello World   ".strip()

'Hello World'

In [47]:
chi['Name'].str.lstrip()

0            AARON,  ELVIA J
1          AARON,  JEFFERY M
2             AARON,  KARINA
3        AARON,  KIMBERLEI R
4        ABAD JR,  VICENTE M
                ...         
32057    ZYGADLO,  MICHAEL J
32058     ZYGOWICZ,  PETER J
32059      ZYMANTAS,  MARK E
32060    ZYRKOWSKI,  CARLO E
32061    ZYSKOWSKI,  DARIUSZ
Name: Name, Length: 32062, dtype: object

#  string methods on index and columns

In [48]:
chi = pd.read_csv("data/chicago.csv", index_col = "Name").dropna(how = 'all')
chi['Department'] = chi['Department'].astype('category')
chi.tail(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [49]:
chi.index

Index(['AARON,  ELVIA J', 'AARON,  JEFFERY M', 'AARON,  KARINA',
       'AARON,  KIMBERLEI R', 'ABAD JR,  VICENTE M', 'ABARCA,  ANABEL',
       'ABARCA,  EMMANUEL', 'ABASCAL,  REECE E', 'ABBASI,  CHRISTOPHER',
       'ABBATACOLA,  ROBERT J',
       ...
       'ZWIT,  JEFFREY J', 'ZWOLFER,  MATTHEW W', 'ZYCH,  MATEUSZ',
       'ZYDEK,  BRYAN', 'ZYGADLO,  JOHN P', 'ZYGADLO,  MICHAEL J',
       'ZYGOWICZ,  PETER J', 'ZYMANTAS,  MARK E', 'ZYRKOWSKI,  CARLO E',
       'ZYSKOWSKI,  DARIUSZ'],
      dtype='object', name='Name', length=32062)

In [52]:
chi.index = chi.index.str.strip().str.title()
chi.index

Index(['Aaron,  Elvia J', 'Aaron,  Jeffery M', 'Aaron,  Karina',
       'Aaron,  Kimberlei R', 'Abad Jr,  Vicente M', 'Abarca,  Anabel',
       'Abarca,  Emmanuel', 'Abascal,  Reece E', 'Abbasi,  Christopher',
       'Abbatacola,  Robert J',
       ...
       'Zwit,  Jeffrey J', 'Zwolfer,  Matthew W', 'Zych,  Mateusz',
       'Zydek,  Bryan', 'Zygadlo,  John P', 'Zygadlo,  Michael J',
       'Zygowicz,  Peter J', 'Zymantas,  Mark E', 'Zyrkowski,  Carlo E',
       'Zyskowski,  Dariusz'],
      dtype='object', name='Name', length=32062)

In [53]:
chi.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [55]:
chi.columns =  chi.columns.str.upper()

Index(['POSITION TITLE', 'DEPARTMENT', 'EMPLOYEE ANNUAL SALARY'], dtype='object')

# .split() method

In [59]:
"Hello my name is Boris".split(" ")

['Hello', 'my', 'name', 'is', 'Boris']

In [63]:
chi = pd.read_csv("data/chicago.csv").dropna(how = 'all')
chi['Department'] = chi['Department'].astype('category')
chi.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [65]:
# What is the most common last name?

chi["Name"].str.split(',')

0            [AARON,   ELVIA J]
1          [AARON,   JEFFERY M]
2             [AARON,   KARINA]
3        [AARON,   KIMBERLEI R]
4        [ABAD JR,   VICENTE M]
                  ...          
32057    [ZYGADLO,   MICHAEL J]
32058     [ZYGOWICZ,   PETER J]
32059      [ZYMANTAS,   MARK E]
32060    [ZYRKOWSKI,   CARLO E]
32061    [ZYSKOWSKI,   DARIUSZ]
Name: Name, Length: 32062, dtype: object

In [68]:
chi["Name"].str.split(',').str.get(0).str.title() # get first elem in each list

0            Aaron
1            Aaron
2            Aaron
3            Aaron
4          Abad Jr
           ...    
32057      Zygadlo
32058     Zygowicz
32059     Zymantas
32060    Zyrkowski
32061    Zyskowski
Name: Name, Length: 32062, dtype: object

In [69]:
chi["Name"].str.split(',').str.get(0).str.title().value_counts()

Williams      293
Johnson       244
Smith         241
Brown         185
Jones         183
             ... 
Catalan         1
Shapira         1
Trollinger      1
Hrysikos        1
Ramones         1
Name: Name, Length: 13829, dtype: int64

In [74]:
# Get most common _First_ name
chi["Name"].str.split(',').str.get(1)

0              ELVIA J
1            JEFFERY M
2               KARINA
3          KIMBERLEI R
4            VICENTE M
             ...      
32057        MICHAEL J
32058          PETER J
32059           MARK E
32060          CARLO E
32061          DARIUSZ
Name: Name, Length: 32062, dtype: object

In [76]:
chi["Name"].str.split(',').str.get(1).str.split(' ')

0            [, , ELVIA, J]
1          [, , JEFFERY, M]
2              [, , KARINA]
3        [, , KIMBERLEI, R]
4          [, , VICENTE, M]
                ...        
32057      [, , MICHAEL, J]
32058        [, , PETER, J]
32059         [, , MARK, E]
32060        [, , CARLO, E]
32061         [, , DARIUSZ]
Name: Name, Length: 32062, dtype: object

In [77]:
chi["Name"].str.split(',').str.get(1).str.strip().str.split(' ')

0            [ELVIA, J]
1          [JEFFERY, M]
2              [KARINA]
3        [KIMBERLEI, R]
4          [VICENTE, M]
              ...      
32057      [MICHAEL, J]
32058        [PETER, J]
32059         [MARK, E]
32060        [CARLO, E]
32061         [DARIUSZ]
Name: Name, Length: 32062, dtype: object

In [87]:
chi["Name"].str.split(',').str.get(1).str.strip().str.split(' ').str.get(0)

0            ELVIA
1          JEFFERY
2           KARINA
3        KIMBERLEI
4          VICENTE
           ...    
32057      MICHAEL
32058        PETER
32059         MARK
32060        CARLO
32061      DARIUSZ
Name: Name, Length: 32062, dtype: object

In [89]:
chi["Name"].str.split(',').str.get(1).str.strip().str.split(' ').str.get(0).value_counts()

MICHAEL      1153
JOHN          899
JAMES         676
ROBERT        622
JOSEPH        537
             ... 
DUJUAN          1
CHASSATTY       1
AWWAD           1
ELIBERTO        1
SOLOMIS         1
Name: Name, Length: 5091, dtype: int64

# expand and n Params in str.split()

In [90]:
chi["Name"].str.split(',', expand = True)  # DataFrame will be returned

Unnamed: 0,0,1
0,AARON,ELVIA J
1,AARON,JEFFERY M
2,AARON,KARINA
3,AARON,KIMBERLEI R
4,ABAD JR,VICENTE M
...,...,...
32057,ZYGADLO,MICHAEL J
32058,ZYGOWICZ,PETER J
32059,ZYMANTAS,MARK E
32060,ZYRKOWSKI,CARLO E


In [92]:
# Add new columns to original DF
chi[["First Name", "Last Name"]] = chi["Name"].str.split(',', expand = True)
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA


In [94]:
chi["Position Title"].str.split(' ', expand = True)  # Diff number of words

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,
3,CHIEF,CONTRACT,EXPEDITER,,,,,,
4,CIVIL,ENGINEER,IV,,,,,,
...,...,...,...,...,...,...,...,...,...
32057,FRM,OF,MACHINISTS,-,AUTOMOTIVE,,,,
32058,POLICE,OFFICER,,,,,,,
32059,POLICE,OFFICER,,,,,,,
32060,POLICE,OFFICER,,,,,,,


In [95]:
chi["Position Title"].str.split(' ', expand = True, n = 1) # Limit the number of splits

Unnamed: 0,0,1
0,WATER,RATE TAKER
1,POLICE,OFFICER
2,POLICE,OFFICER
3,CHIEF,CONTRACT EXPEDITER
4,CIVIL,ENGINEER IV
...,...,...
32057,FRM,OF MACHINISTS - AUTOMOTIVE
32058,POLICE,OFFICER
32059,POLICE,OFFICER
32060,POLICE,OFFICER
