# Manipulating Text data

### Import pandas

In [34]:
import pandas as pd

### Read CSV
In this set of videos we'll use a modified version of the [Student Performance](https://www.kaggle.com/spscientist/students-performance-in-exams) dataset from Kaggle. This dataset contains student performance data for analyzing the influence of parents' background, test preparation, etc. on students' performance.

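The modified version differs from the original in three ways:

1. `gender` and `test preparation course` values are uppercase
2. all scores are strings with a trailing `%` that needs to be removed
3. every `lunch` value has a leading space (hard to see in the output)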

Explore the dataset and optimize its memory usage by converting gender, race/ethnicity, parental level of education, lunch, and test preparation course into categories.

In [35]:
# Load the modified Student Performance CSV (path is relative to the
# notebook's working directory) and preview the first five rows.
students = pd.read_csv(filepath_or_buffer="StudentsPerformance.csv")
students.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,FEMALE,group B,bachelor's degree,standard,NONE,72%,72%,74%
1,FEMALE,group C,some college,standard,COMPLETED,69%,90%,88%
2,FEMALE,group B,master's degree,standard,NONE,90%,95%,93%
3,MALE,group A,associate's degree,free/reduced,NONE,47%,57%,44%
4,MALE,group C,some college,standard,NONE,76%,78%,75%


In [36]:
# Inspect column dtypes, non-null counts, and memory usage before any conversion.
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   object
 6   reading score                1000 non-null   object
 7   writing score                1000 non-null   object
dtypes: object(8)
memory usage: 62.6+ KB


In [33]:
# Count distinct values per column to spot low-cardinality columns
# that are candidates for the category dtype.
students.nunique()

gender                          2
race/ethnicity                  5
parental level of education     6
lunch                           2
test preparation course         2
math score                     81
reading score                  72
writing score                  77
dtype: int64

In [37]:
# Parental education has only six distinct values, so store it as a categorical.
students['parental level of education'] = (
    students['parental level of education'].astype(dtype='category')
)

In [38]:
# Re-check dtypes and memory usage after converting 'parental level of education'.
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   gender                       1000 non-null   object  
 1   race/ethnicity               1000 non-null   object  
 2   parental level of education  1000 non-null   category
 3   lunch                        1000 non-null   object  
 4   test preparation course      1000 non-null   object  
 5   math score                   1000 non-null   object  
 6   reading score                1000 non-null   object  
 7   writing score                1000 non-null   object  
dtypes: category(1), object(7)
memory usage: 56.0+ KB


In [39]:
# race/ethnicity has only five distinct values, so store it as a categorical.
students['race/ethnicity'] = (
    students['race/ethnicity'].astype(dtype='category')
)

In [40]:
# Re-check dtypes and memory usage after converting 'race/ethnicity'.
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   gender                       1000 non-null   object  
 1   race/ethnicity               1000 non-null   category
 2   parental level of education  1000 non-null   category
 3   lunch                        1000 non-null   object  
 4   test preparation course      1000 non-null   object  
 5   math score                   1000 non-null   object  
 6   reading score                1000 non-null   object  
 7   writing score                1000 non-null   object  
dtypes: category(2), object(6)
memory usage: 49.4+ KB


In [41]:
# gender has only two distinct values, so store it as a categorical,
# then check the effect on dtypes and memory usage.
students['gender'] = (
    students['gender'].astype(dtype='category')
)
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   gender                       1000 non-null   category
 1   race/ethnicity               1000 non-null   category
 2   parental level of education  1000 non-null   category
 3   lunch                        1000 non-null   object  
 4   test preparation course      1000 non-null   object  
 5   math score                   1000 non-null   object  
 6   reading score                1000 non-null   object  
 7   writing score                1000 non-null   object  
dtypes: category(3), object(5)
memory usage: 42.7+ KB


In [42]:
# Re-check distinct values per column: 'lunch' and 'test preparation course'
# (two values each) are still stored as object dtype at this point.
students.nunique()

gender                          2
race/ethnicity                  5
parental level of education     6
lunch                           2
test preparation course         2
math score                     81
reading score                  72
writing score                  77
dtype: int64

In [43]:
# lunch has only two distinct values, so store it as a categorical.
students['lunch'] = (
    students['lunch'].astype(dtype='category')
)

In [44]:
# Re-check dtypes and memory usage after converting 'lunch'.
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   gender                       1000 non-null   category
 1   race/ethnicity               1000 non-null   category
 2   parental level of education  1000 non-null   category
 3   lunch                        1000 non-null   category
 4   test preparation course      1000 non-null   object  
 5   math score                   1000 non-null   object  
 6   reading score                1000 non-null   object  
 7   writing score                1000 non-null   object  
dtypes: category(4), object(4)
memory usage: 35.9+ KB


In [45]:
# test preparation course has only two distinct values, so store it as a categorical.
students['test preparation course'] = (
    students['test preparation course'].astype(dtype='category')
)

In [47]:
# Final check: the five low-cardinality columns are now category dtype,
# while the three score columns are still object (strings).
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   gender                       1000 non-null   category
 1   race/ethnicity               1000 non-null   category
 2   parental level of education  1000 non-null   category
 3   lunch                        1000 non-null   category
 4   test preparation course      1000 non-null   category
 5   math score                   1000 non-null   object  
 6   reading score                1000 non-null   object  
 7   writing score                1000 non-null   object  
dtypes: category(5), object(3)
memory usage: 29.2+ KB


In [48]:
# Preview the rows again: converting to category changes storage only,
# so the displayed values are the same as before.
students.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,FEMALE,group B,bachelor's degree,standard,NONE,72%,72%,74%
1,FEMALE,group C,some college,standard,COMPLETED,69%,90%,88%
2,FEMALE,group B,master's degree,standard,NONE,90%,95%,93%
3,MALE,group A,associate's degree,free/reduced,NONE,47%,57%,44%
4,MALE,group C,some college,standard,NONE,76%,78%,75%
