In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns 
import scipy.stats as stats
import scipy
import statsmodels.formula.api as smf
import os
# Print the full path of every file available under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(folder, file_name)
        print(full_path)
import matplotlib.pyplot as plt
from matplotlib import ticker

In [None]:
# Load the dataset into a DataFrame

raw_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/students-exam-scores/Expanded_data_with_more_features.csv',
)

In [None]:
# Preview the first five rows of the raw data
raw_df.head(n=5)

In [None]:
# Check the data type of each column

raw_df.dtypes

In [None]:
# Drop the variables that will not be used in the analysis (in place, so the
# following cells keep working with `raw_df`).

unused_columns = ['EthnicGroup', 'Gender', 'NrSiblings', 'TransportMeans']
raw_df.drop(columns=unused_columns, inplace=True)

raw_df.columns

In [None]:
# Rename the unnamed first column to 'Code'

raw_df.rename({'Unnamed: 0': 'Code'}, axis='columns', inplace=True)
raw_df.head(n=5)

In [None]:
# Check, column by column, whether there is at least one null value

raw_df.isna().any(axis=0)

In [None]:
# Count the total number of null values in the whole frame

nulls_per_column = raw_df.isna().sum()
nulls_per_column.sum()

In [None]:
# Number of null values in each variable

raw_df.isna().sum(axis=0)

In [None]:
# Check whether the null values can be dropped without strongly affecting the
# data: non-null count of each candidate column (compare against len(raw_df)).
# This replaces six commented-out per-column `.count()` calls with one live
# expression that computes the same numbers.

raw_df[
    [
        'ParentEduc',
        'TestPrep',
        'ParentMaritalStatus',
        'PracticeSport',
        'IsFirstChild',
        'WklyStudyHours',
    ]
].count()


In [None]:
# Drop every row that contains at least one null value

df = raw_df.dropna(axis='index', how='any')

In [None]:
# Inspect the data again after dropping the null values.
# `df` is the frame produced by the dropna step; `df_copy` is only created in
# a later cell, so referencing it here fails on a fresh kernel.

df.head()

# Análisis de variables categóricas

In [None]:
# Preview only the string (object-dtype) columns
df.select_dtypes(include='object').head(n=5)

In [None]:
# Count how many observations fall into each category of the variable

df.loc[:, 'IsFirstChild'].value_counts()

In [None]:
# Category frequencies of LunchType
df.loc[:, 'LunchType'].value_counts()

In [None]:
# Category frequencies of TestPrep
df.loc[:, 'TestPrep'].value_counts()

In [None]:
# Category frequencies of ParentEduc
df.loc[:, 'ParentEduc'].value_counts()

In [None]:
# Category frequencies of ParentMaritalStatus
df.loc[:, 'ParentMaritalStatus'].value_counts()

In [None]:
# Category frequencies of PracticeSport
df.loc[:, 'PracticeSport'].value_counts()

In [None]:
# Category frequencies of WklyStudyHours
df.loc[:, 'WklyStudyHours'].value_counts()

In [None]:
# Frequency of every (ParentEduc, ParentMaritalStatus) combination
df.value_counts(subset=['ParentEduc', 'ParentMaritalStatus']).reset_index(name='count')

In [None]:
# Frequency of every (PracticeSport, WklyStudyHours) combination
df.value_counts(subset=['PracticeSport', 'WklyStudyHours']).reset_index(name='count')

In [None]:
# Frequency of every (TestPrep, WklyStudyHours) combination
df.value_counts(subset=['TestPrep', 'WklyStudyHours']).reset_index(name='count')

In [None]:
# Frequency of every (IsFirstChild, ParentMaritalStatus) combination
df.value_counts(subset=['IsFirstChild', 'ParentMaritalStatus']).reset_index(name='count')

In [None]:
# Frequency of every (LunchType, ParentMaritalStatus) combination
df.value_counts(subset=['LunchType', 'ParentMaritalStatus']).reset_index(name='count')

In [None]:
# Frequency of every (ParentEduc, WklyStudyHours) combination
df.value_counts(subset=['ParentEduc', 'WklyStudyHours']).reset_index(name='count')

In [None]:
# Frequency of every (TestPrep, LunchType) combination
df.value_counts(subset=['TestPrep', 'LunchType']).reset_index(name='count')

# Análisis de variables numéricas

In [None]:
# Create a copy to experiment on (so `df` stays untouched) and encode some
# categorical variables as 0/1 integers.

df_copy = df.copy()

# `.map(...).astype(int)` yields a true integer column and fails loudly if a
# value is not covered by the mapping, instead of relying on the implicit
# object -> int downcasting of `.replace`.
df_copy['IsFirstChild'] = df_copy['IsFirstChild'].map({'yes': 1, 'no': 0}).astype(int)
df_copy['TestPrep'] = df_copy['TestPrep'].map({'completed': 1, 'none': 0}).astype(int)
# LunchType must be encoded from its own column; deriving it from the already
# encoded TestPrep made it an exact duplicate of TestPrep.
df_copy['LunchType'] = df_copy['LunchType'].map({'standard': 1, 'free/reduced': 0}).astype(int)

In [None]:
# Mean of the students' scores

scores = df_copy.loc[:, ['MathScore', 'ReadingScore', 'WritingScore']]
scores.mean(axis=0)

In [None]:
# Median of the students' scores

scores.median(axis=0)

In [None]:
# Mode of the students' scores

scores.mode(axis=0)

# Análisis Bivariado

In [None]:
# Preview the encoded working copy
df_copy.head(n=5)

In [None]:
# Pairwise correlation of the numeric columns only. The frame still holds
# string columns, and `DataFrame.corr()` raises on non-numeric data in
# pandas >= 2.0, so they are excluded explicitly.
df_copy.select_dtypes(include='number').corr()

In [None]:
# Correlation matrix heatmap.
# Only numeric columns are correlated: `DataFrame.corr()` raises on the
# remaining string columns in pandas >= 2.0.

sns.heatmap(
    df_copy.select_dtypes(include='number').corr(),
    annot=True,
    cmap='coolwarm',
);

In [None]:
# Predictor / response pairs for the two simple regressions:
#   1) WritingScore as a function of ReadingScore
#   2) MathScore as a function of ReadingScore
x_1, y_1 = df_copy['ReadingScore'], df_copy['WritingScore']
x_2, y_2 = df_copy['ReadingScore'], df_copy['MathScore']

In [None]:
# Fit both simple linear regressions and print one result per line.
res_1 = scipy.stats.linregress(x_1, y_1)
res_2 = scipy.stats.linregress(x_2, y_2)

print('\n'.join(str(result) for result in (res_1, res_2)))

In [None]:
# Scatter plot of ReadingScore vs. WritingScore

sns.jointplot(x='ReadingScore', y='WritingScore', data=df_copy)

# Análisis multivariado

In [None]:
# OLS regression of ReadingScore on a single predictor (WritingScore).

model_1 = smf.ols('ReadingScore ~ WritingScore', data=df_copy).fit()

model_1.summary()

In [None]:
# OLS regression of ReadingScore on two predictors (WritingScore and MathScore).
model_2 = smf.ols('ReadingScore ~ WritingScore + MathScore', data=df_copy).fit()

model_2.summary()

In [None]:
# Logistic regression: TestPrep explained by the three exam scores.

(
    smf.logit('TestPrep ~ ReadingScore + WritingScore + MathScore', data=df_copy)
    .fit()
    .summary()
)