In [1]:
# Imports:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read one SAT results CSV per test year (2017-2019) into its own DataFrame.
sat_17 = pd.read_csv('../data/sat_2017.csv')
sat_18 = pd.read_csv('../data/sat_2018.csv')
sat_19 = pd.read_csv('../data/sat_2019.csv')

In [3]:
# 1. Display the data: show the first 5 rows of each dataframe in the notebook.
sat_17.head(n=5)

Unnamed: 0,State,Participation,Evidence-Based Reading and Writing,Math,Total
0,Alabama,5%,593,572,1165
1,Alaska,38%,547,533,1080
2,Arizona,30%,563,553,1116
3,Arkansas,3%,614,594,1208
4,California,53%,531,524,1055


In [4]:
# First 5 rows of the 2018 data.
sat_18.head(n=5)

Unnamed: 0,State,Participation,Evidence-Based Reading and Writing,Math,Total
0,Alabama,6%,595,571,1166
1,Alaska,43%,562,544,1106
2,Arizona,29%,577,572,1149
3,Arkansas,5%,592,576,1169
4,California,60%,540,536,1076


In [5]:
# First 5 rows of the 2019 data.
sat_19.head(n=5)

Unnamed: 0,State,Participation Rate,EBRW,Math,Total
0,Alabama,7%,583,560,1143
1,Alaska,41%,556,541,1097
2,Arizona,31%,569,565,1134
3,Arkansas,6%,582,559,1141
4,California,63%,534,531,1065


In [6]:
# 2. Check for missing values: count the nulls in each column.
sat_17.isna().sum()

State                                 0
Participation                         0
Evidence-Based Reading and Writing    0
Math                                  0
Total                                 0
dtype: int64

In [7]:
# Null count per column for the 2018 data.
sat_18.isna().sum()

State                                 0
Participation                         0
Evidence-Based Reading and Writing    0
Math                                  0
Total                                 0
dtype: int64

In [8]:
# Null count per column for the 2019 data.
sat_19.isna().sum()

State                 0
Participation Rate    0
EBRW                  0
Math                  0
Total                 0
dtype: int64

In [9]:
#3. Check for any obvious issues with the observations (keep in mind the minimum & maximum possible values for each test/subtest).
# NOTE(review): the summary for this frame reports a Math minimum of 52, which
# is below the 200-point floor of an SAT section -- a data-entry error that
# needs correcting before any analysis of the 2017 math scores.
sat_17.describe()

Unnamed: 0,Evidence-Based Reading and Writing,Math,Total
count,51.0,51.0,51.0
mean,569.117647,547.627451,1126.098039
std,45.666901,84.909119,92.494812
min,482.0,52.0,950.0
25%,533.5,522.0,1055.5
50%,559.0,548.0,1107.0
75%,613.0,599.0,1212.0
max,644.0,651.0,1295.0


In [10]:
# Summary statistics for the numeric columns of the 2018 data.
sat_18.describe()

Unnamed: 0,Evidence-Based Reading and Writing,Math,Total
count,51.0,51.0,51.0
mean,563.686275,556.235294,1120.019608
std,47.502627,47.772623,94.155083
min,480.0,480.0,977.0
25%,534.5,522.5,1057.5
50%,552.0,544.0,1098.0
75%,610.5,593.5,1204.0
max,643.0,655.0,1298.0


In [11]:
# Summary statistics for the numeric columns of the 2019 data.
sat_19.describe()

Unnamed: 0,EBRW,Math,Total
count,53.0,53.0,53.0
mean,558.0,548.471698,1106.528302
std,47.554422,53.785044,100.994784
min,483.0,445.0,935.0
25%,518.0,506.0,1024.0
50%,550.0,545.0,1097.0
75%,610.0,596.0,1200.0
max,636.0,648.0,1284.0


In [12]:
# 5. Display the data types of each feature.
# Participation is read as object (text such as '5%'), not as a number.
sat_17.dtypes

State                                 object
Participation                         object
Evidence-Based Reading and Writing     int64
Math                                   int64
Total                                  int64
dtype: object

In [13]:
# Column data types of the 2018 data.
sat_18.dtypes

State                                 object
Participation                         object
Evidence-Based Reading and Writing     int64
Math                                   int64
Total                                  int64
dtype: object

In [14]:
# Column data types of the 2019 data (note the different column names: 'Participation Rate', 'EBRW').
sat_19.dtypes

State                 object
Participation Rate    object
EBRW                   int64
Math                   int64
Total                  int64
dtype: object

In [15]:
# Find states in the 2019 file that are not included in the 2018 or 2017 files.

# Row counts per year: a mismatch means some year has extra or missing states.
print(len(sat_19))
print(len(sat_18))
print(len(sat_17))

# A state must appear in BOTH earlier years to be comparable across all three;
# anything in 2019 outside that set is listed here (previously only 2017 was checked).
states_in_both_prior_years = set(sat_17.State) & set(sat_18.State)
[state for state in sat_19.State if state not in states_in_both_prior_years]

53
51
51


['Puerto Rico', 'Virgin Islands']

In [16]:
#8. Drop unnecessary rows (if needed).
# Puerto Rico and the Virgin Islands appear only in the 2019 file, so remove
# them to keep the same set of states in every year.
territories = ['Puerto Rico', 'Virgin Islands']
territory_rows = sat_19.index[sat_19.State.isin(territories)]
sat_19 = sat_19.drop(territory_rows)

# Row counts per year after the drop.
for frame in (sat_19, sat_18, sat_17):
    print(len(frame))


51
51
51


In [18]:
#Function for cleaning strings with % signs -> converts to float
def percent_clean(percent_str):
    """Convert a percentage string such as ``'38%'`` to a fraction (``0.38``).

    Parameters
    ----------
    percent_str : str, None or float NaN
        Text holding a number; every ``%`` character in it is removed before
        conversion. ``None`` and NaN are treated as missing values.

    Returns
    -------
    float
        The numeric value divided by 100, or NaN when the input is missing.

    Raises
    ------
    TypeError
        If the input is a non-missing value that is not a string. Numbers are
        rejected on purpose: silently dividing an already-cleaned fraction by
        100 again (e.g. when the cell is re-run) would corrupt the data.
    ValueError
        If the text left after removing ``%`` is not a valid number.
    """
    # Missing values pass through as NaN (NaN is the only float != itself).
    if percent_str is None or (isinstance(percent_str, float) and percent_str != percent_str):
        return float('nan')
    if not isinstance(percent_str, str):
        raise TypeError(
            'percent_clean expects a string like "38%", got '
            + type(percent_str).__name__
        )
    return float(percent_str.replace('%', '')) / 100

#6. Fix any incorrect data types found in step 5.
# Convert each year's participation column from '%' text to a float fraction.
# The 2019 file names the column 'Participation Rate' instead of 'Participation'.
participation_columns = (
    (sat_17, 'Participation'),
    (sat_18, 'Participation'),
    (sat_19, 'Participation Rate'),
)
for frame, column in participation_columns:
    frame[column] = frame[column].map(percent_clean)

sat_19.head()


Unnamed: 0,State,Participation Rate,EBRW,Math,Total
0,Alabama,0.07,583,560,1143
1,Alaska,0.41,556,541,1097
2,Arizona,0.31,569,565,1134
3,Arkansas,0.06,582,559,1141
4,California,0.63,534,531,1065


In [19]:
# Make sure dataframes are all in the same order.
# sort_values returns a NEW frame rather than sorting in place, so its result
# must be assigned (it was previously discarded, leaving the frames unsorted).
# The index is then rebuilt so that row i is the same state in every year.
sat_17 = sat_17.sort_values('State').reset_index(drop=True)
sat_18 = sat_18.sort_values('State').reset_index(drop=True)
sat_19 = sat_19.sort_values('State').reset_index(drop=True)

In [20]:
#7. Rename Columns.
# Every column gets a lowercase, year-suffixed name so the three frames can be
# combined without name clashes. rename() returns a new frame, which is
# re-bound to the same name instead of mutating in place.

sat_17 = sat_17.rename(columns={
    'State': 'state_17_sat',
    'Participation': 'participation_17_sat',
    'Evidence-Based Reading and Writing': 'erw_17_sat',
    'Math': 'math_17_sat',
    'Total': 'total_17_sat'
})

sat_18 = sat_18.rename(columns={
    'State': 'state_18_sat',
    'Participation': 'participation_18_sat',
    'Evidence-Based Reading and Writing': 'erw_18_sat',
    'Math': 'math_18_sat',
    'Total': 'total_18_sat'
})

# The 2019 file uses different source headers ('Participation Rate', 'EBRW').
sat_19 = sat_19.rename(columns={
    'State': 'state_19_sat',
    'Participation Rate': 'participation_19_sat',
    'EBRW': 'erw_19_sat',
    'Math': 'math_19_sat',
    'Total': 'total_19_sat'
})

# Only the last expression of a cell is displayed, so show the final result once.
sat_19.columns

Index(['state_19_sat', 'participation_19_sat', 'erw_19_sat', 'math_19_sat',
       'total_19_sat'],
      dtype='object')

In [21]:
#9. Merge dataframes that can be merged.
# Match rows on the state name rather than on row position: an index-based join
# silently pairs different states whenever the frames are not in the same order.
# validate='one_to_one' raises if any state name is duplicated within a year.
sat_merged = (
    sat_17
    .merge(sat_18, how='inner', left_on='state_17_sat', right_on='state_18_sat',
           validate='one_to_one')
    .merge(sat_19, how='inner', left_on='state_17_sat', right_on='state_19_sat',
           validate='one_to_one')
)

# An inner merge drops states whose names do not match across years; fail
# loudly instead of losing rows without notice.
assert len(sat_merged) == len(sat_17), 'state names differ between years; rows were dropped by the merge'

sat_merged.head()

Unnamed: 0,state_17_sat,participation_17_sat,erw_17_sat,math_17_sat,total_17_sat,state_18_sat,participation_18_sat,erw_18_sat,math_18_sat,total_18_sat,state_19_sat,participation_19_sat,erw_19_sat,math_19_sat,total_19_sat
0,Alabama,0.05,593,572,1165,Alabama,0.06,595,571,1166,Alabama,0.07,583,560,1143
1,Alaska,0.38,547,533,1080,Alaska,0.43,562,544,1106,Alaska,0.41,556,541,1097
2,Arizona,0.3,563,553,1116,Arizona,0.29,577,572,1149,Arizona,0.31,569,565,1134
3,Arkansas,0.03,614,594,1208,Arkansas,0.05,592,576,1169,Arkansas,0.06,582,559,1141
4,California,0.53,531,524,1055,California,0.6,540,536,1076,California,0.63,534,531,1065


In [22]:
# Confirm the merged frame's dimensions as (rows, columns).
sat_merged.shape

(51, 15)

In [23]:
# Check the column data types of the merged frame.
sat_merged.dtypes

state_17_sat             object
participation_17_sat    float64
erw_17_sat                int64
math_17_sat               int64
total_17_sat              int64
state_18_sat             object
participation_18_sat    float64
erw_18_sat                int64
math_18_sat               int64
total_18_sat              int64
state_19_sat             object
participation_19_sat    float64
erw_19_sat                int64
math_19_sat               int64
total_19_sat              int64
dtype: object

In [24]:
#10. Perform any additional cleaning that you feel is necessary.
# An SAT section score must lie between 200 and 800, yet the 2017 summary
# statistics show a Math minimum of 52. Any out-of-range math score is rebuilt
# from the other two columns of the same row (Total = ERW + Math); in-range
# scores are left untouched.
SECTION_MIN, SECTION_MAX = 200, 800
for year in ('17', '18', '19'):
    math_col = 'math_' + year + '_sat'
    erw_col = 'erw_' + year + '_sat'
    total_col = 'total_' + year + '_sat'
    out_of_range = (sat_merged[math_col] < SECTION_MIN) | (sat_merged[math_col] > SECTION_MAX)
    sat_merged.loc[out_of_range, math_col] = (
        sat_merged.loc[out_of_range, total_col] - sat_merged.loc[out_of_range, erw_col]
    )

#11. Save your cleaned and merged dataframes as csv files.
# index=False keeps the positional row index out of the saved file.
sat_merged.to_csv('../data/sat_merged.csv', index = False)

In [None]:
# Summary statistics for the numeric columns of the merged frame.
sat_merged.describe()

In [None]:
# Display the merged frame for a final visual check.
sat_merged