<span style="color: #60B5FC; font-weight: bold; font-size: 24px;">01 Exploring data</span>

<span style="color: #AC1555; font-weight: bold; font-size: 18px;">Libraries</span>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz as sv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt

  from .autonotebook import tqdm as notebook_tqdm


<span style="color: #AC1555; font-weight: bold; font-size: 18px;">Data preparation and data clearning
EDA, feature importance analysis
Model selection process and parameter tuning</span>

In [3]:
df = pd.read_csv('/Users/anagilabert/repos/students_exam_scores/data/Expanded_data_with_more_features.csv', index_col=0)
df = df.drop(['ReadingScore', 'WritingScore'], axis=1)


In [4]:

df.isna().sum()

Gender                    0
EthnicGroup            1840
ParentEduc             1845
LunchType                 0
TestPrep               1830
ParentMaritalStatus    1190
PracticeSport           631
IsFirstChild            904
NrSiblings             1572
TransportMeans         3134
WklyStudyHours          955
MathScore                 0
dtype: int64

In [5]:
df.head()

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore
0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71
1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69
2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87
3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,45
4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76


In [6]:
df.shape

(30641, 12)

In [7]:
df.dtypes

Gender                  object
EthnicGroup             object
ParentEduc              object
LunchType               object
TestPrep                object
ParentMaritalStatus     object
PracticeSport           object
IsFirstChild            object
NrSiblings             float64
TransportMeans          object
WklyStudyHours          object
MathScore                int64
dtype: object

In [8]:
df.describe(include='all')

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore
count,30641,28801,28796,30641,28811,29451,30010,29737,29069.0,27507,29686,30641.0
unique,2,5,6,2,2,4,3,2,,2,3,
top,female,group C,some college,standard,none,married,sometimes,yes,,school_bus,5 - 10,
freq,15424,9212,6633,19905,18856,16844,15213,19082,,16145,16246,
mean,,,,,,,,,2.145894,,,66.558402
std,,,,,,,,,1.458242,,,15.361616
min,,,,,,,,,0.0,,,0.0
25%,,,,,,,,,1.0,,,56.0
50%,,,,,,,,,2.0,,,67.0
75%,,,,,,,,,3.0,,,78.0


<span style="color: #AC1555; font-weight: bold; font-size: 18px;">Format review</span> 

In [9]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

<span style="color: #AC1555; font-weight: bold; font-size: 18px;">Get unique values in each column</span> 


In [10]:
print('Unique values in the column:')
for column in df.columns:
    unique_values = df[column].value_counts().index.tolist()
    print(f"'{column}': {unique_values}")

Unique values in the column:
'gender': ['female', 'male']
'ethnicgroup': ['group C', 'group D', 'group B', 'group E', 'group A']
'parenteduc': ['some college', 'high school', "associate's degree", 'some high school', "bachelor's degree", "master's degree"]
'lunchtype': ['standard', 'free/reduced']
'testprep': ['none', 'completed']
'parentmaritalstatus': ['married', 'single', 'divorced', 'widowed']
'practicesport': ['sometimes', 'regularly', 'never']
'isfirstchild': ['yes', 'no']
'nrsiblings': [1.0, 2.0, 3.0, 0.0, 4.0, 5.0, 6.0, 7.0]
'transportmeans': ['school_bus', 'private']
'wklystudyhours': ['5 - 10', '< 5', '> 10']
'mathscore': [64, 67, 71, 70, 63, 62, 65, 66, 72, 69, 75, 73, 60, 61, 74, 76, 58, 59, 57, 77, 78, 56, 79, 80, 68, 55, 53, 54, 82, 83, 81, 51, 52, 84, 50, 85, 49, 86, 48, 47, 45, 88, 46, 87, 89, 44, 100, 90, 91, 43, 42, 93, 92, 41, 94, 95, 39, 40, 97, 96, 38, 36, 37, 99, 35, 98, 32, 33, 31, 34, 29, 30, 28, 26, 27, 25, 24, 23, 22, 21, 18, 19, 17, 20, 16, 13, 9, 10, 11, 15,

<span style="color: #AC1555; font-weight: bold; font-size: 18px;">Separate df in numeric and categorical columns</span> 

In [11]:
numeric_columns = df.select_dtypes(include=['int', 'float'])
print("Numeric columns:")
print(numeric_columns)

Numeric columns:
     nrsiblings  mathscore
0           3.0         71
1           0.0         69
2           4.0         87
3           1.0         45
4           0.0         76
..          ...        ...
816         2.0         59
890         1.0         58
911         1.0         61
934         3.0         82
960         1.0         64

[30641 rows x 2 columns]


In [12]:
categorical_columns = df.select_dtypes(include=['object'])
print("Categorical columns:")
print(categorical_columns)

Categorical columns:
     gender ethnicgroup          parenteduc     lunchtype   testprep  \
0    female         NaN   bachelor's degree      standard       none   
1    female     group C        some college      standard        NaN   
2    female     group B     master's degree      standard       none   
3      male     group A  associate's degree  free/reduced       none   
4      male     group C        some college      standard       none   
..      ...         ...                 ...           ...        ...   
816  female     group D         high school      standard       none   
890    male     group E         high school      standard       none   
911  female         NaN         high school  free/reduced  completed   
934  female     group D  associate's degree      standard  completed   
960    male     group B        some college      standard       none   

    parentmaritalstatus practicesport isfirstchild transportmeans  \
0               married     regularly        

In [13]:
for column in categorical_columns:
    percentage_values = df[column].value_counts(normalize=True) * 100
    print(f'Percentage of unique values in {column}:\n{percentage_values}\n')

Percentage of unique values in gender:
female    50.337783
male      49.662217
Name: gender, dtype: float64

Percentage of unique values in ethnicgroup:
group C    31.985001
group D    26.051179
group B    20.228464
group E    14.030763
group A     7.704594
Name: ethnicgroup, dtype: float64

Percentage of unique values in parenteduc:
some college          23.034449
high school           19.749271
associate's degree    19.273510
some high school      19.158911
bachelor's degree     11.758578
master's degree        7.025281
Name: parenteduc, dtype: float64

Percentage of unique values in lunchtype:
standard        64.961979
free/reduced    35.038021
Name: lunchtype, dtype: float64

Percentage of unique values in testprep:
none         65.447225
completed    34.552775
Name: testprep, dtype: float64

Percentage of unique values in parentmaritalstatus:
married     57.193304
single      24.097654
divorced    16.702319
widowed      2.006723
Name: parentmaritalstatus, dtype: float64

Percentag

<span style="color: #AC1555; font-weight: bold; font-size: 18px;">Report</span>

In [15]:
report = sv.analyze(df, target_feat='mathscore')
report.show_html('../reports/report.html', open_browser=False)

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:00 -> (00:00 left)

Report ../reports/report.html was generated.



