# String Methods

### Import pandas

In [25]:
import pandas as pd

### Read CSV
We'll continue where we left off: the dataset is loaded and its low-cardinality text columns are converted to the `category` dtype to lower memory usage.

In [27]:
# Load the raw CSV and cast the low-cardinality text columns to `category`
# in a single chained step, then preview the first rows.
students = pd.read_csv("StudentsPerformance.csv").astype(
    {
        'parental level of education': 'category',
        'race/ethnicity': 'category',
        'gender': 'category',
        'lunch': 'category',
    }
)
students.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,FEMALE,group B,bachelor's degree,standard,NONE,72%,72%,74%
1,FEMALE,group C,some college,standard,COMPLETED,69%,90%,88%
2,FEMALE,group B,master's degree,standard,NONE,90%,95%,93%
3,MALE,group A,associate's degree,free/reduced,NONE,47%,57%,44%
4,MALE,group C,some college,standard,NONE,76%,78%,75%


### Explore data

In [28]:
# Check the column dtypes and memory usage after the category conversion
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   gender                       1000 non-null   category
 1   race/ethnicity               1000 non-null   category
 2   parental level of education  1000 non-null   category
 3   lunch                        1000 non-null   category
 4   test preparation course      1000 non-null   object  
 5   math score                   1000 non-null   object  
 6   reading score                1000 non-null   object  
 7   writing score                1000 non-null   object  
dtypes: category(4), object(4)
memory usage: 35.9+ KB


### String methods

In [7]:
# Python's built-in string methods are useful when working with text;
# .lower() returns a lowercase copy of the string
"COMPLETED".lower()

'completed'

In [9]:
# .upper() returns an uppercase copy of the string
"completed".upper()

'COMPLETED'

In [11]:
# .title() capitalizes the first letter of every word
"test preparation course: completed".title()

'Test Preparation Course: Completed'

In [12]:
# Plain Python string methods are not defined on a pandas Series, so calling
# .lower() directly raises AttributeError. The error is caught and printed so
# the demonstration stays visible while the notebook still runs end to end
# under Restart Kernel -> Run All.
try:
    students["test preparation course"].lower()
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'Series' object has no attribute 'lower'

In [13]:
# Go through the .str accessor to apply a Python string method to every
# value in the Series; the result is a new Series (nothing is modified here)
students["test preparation course"].str.lower()

0           none
1      completed
2           none
3           none
4           none
         ...    
995    completed
996         none
997    completed
998    completed
999         none
Name: test preparation course, Length: 1000, dtype: object

### Convert `test preparation course` to lowercase

In [19]:
# Overwrite the column with its lowercased version so the change persists
students["test preparation course"] = students["test preparation course"].str.lower()

In [18]:
# Preview the first rows to check the converted column
students.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,FEMALE,group B,bachelor's degree,standard,none,72%,72%,74%
1,FEMALE,group C,some college,standard,completed,69%,90%,88%
2,FEMALE,group B,master's degree,standard,none,90%,95%,93%
3,MALE,group A,associate's degree,free/reduced,none,47%,57%,44%
4,MALE,group C,some college,standard,none,76%,78%,75%


### Convert `gender` to lowercase

In [21]:
# `gender` was cast to `category` when the data was loaded. String methods
# reached through `.str` on a categorical column return a plain object-dtype
# Series, so assigning that result back would silently drop the category dtype
# (and its memory saving). Cast the lowercased values back to `category`.
students["gender"] = students["gender"].str.lower().astype("category")

In [29]:
# Re-check the column dtypes and memory usage after the lowercase conversions
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   gender                       1000 non-null   category
 1   race/ethnicity               1000 non-null   category
 2   parental level of education  1000 non-null   category
 3   lunch                        1000 non-null   category
 4   test preparation course      1000 non-null   object  
 5   math score                   1000 non-null   object  
 6   reading score                1000 non-null   object  
 7   writing score                1000 non-null   object  
dtypes: category(4), object(4)
memory usage: 35.9+ KB


### Count the number of characters in each value of a column with `.str.len()`

In [30]:
# .str.len() returns the number of characters in each value of the column
students["gender"].str.len()

0      6
1      6
2      6
3      4
4      4
      ..
995    6
996    4
997    6
998    6
999    6
Name: gender, Length: 1000, dtype: int64

In [24]:
# Uncomment to save the lowercased dataset:
#students.to_csv("StudentsPerformance_lower.csv", index=False)