In [9]:
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
#https://portaldatascience.com/como-lidar-com-valores-faltantes-missing/

In [2]:
def read_data(path):
    """Load a tabular file into a DataFrame, picking the reader from the file extension.

    The extension is taken from the file name only (never from parent
    directories) and is matched case-insensitively. Suffixes are inspected
    from last to first, so a trailing unrecognised suffix (for example a
    compression suffix such as ``.gz`` after ``.csv``) still resolves to
    the matching reader.

    Parameters
    ----------
    path : str or os.PathLike
        Location of a ``.csv`` or ``.xlsx`` file.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.

    Raises
    ------
    ValueError
        If no suffix of the file name is ``.csv`` or ``.xlsx``. Raising here
        replaces the earlier silent ``None`` return, which only surfaced later
        as an unrelated attribute error.
    """
    readers = {'.csv': pd.read_csv, '.xlsx': pd.read_excel}

    for suffix in reversed(Path(path).suffixes):
        reader = readers.get(suffix.lower())
        if reader is not None:
            return reader(path)

    raise ValueError(
        f"Unsupported file type for {str(path)!r}; expected one of {sorted(readers)}"
    )

In [3]:
# Load the test-score dataset from a relative path and preview its first rows.
# `df` is the raw frame that the later cells clean and inspect.
df = read_data('./data/test_scores.csv')
df.head()

Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,student_id,gender,lunch,pretest,posttest
0,ANKYI,Urban,Non-public,6OL,Standard,20.0,2FHT3,Female,Does not qualify,62.0,72.0
1,ANKYI,Urban,Non-public,6OL,Standard,20.0,3JIVH,Female,Does not qualify,66.0,79.0
2,ANKYI,Urban,Non-public,6OL,Standard,20.0,3XOWE,Male,Does not qualify,64.0,76.0
3,ANKYI,Urban,Non-public,6OL,Standard,20.0,556O0,Female,Does not qualify,61.0,77.0
4,ANKYI,Urban,Non-public,6OL,Standard,20.0,74LOE,Male,Does not qualify,64.0,76.0


In [97]:
def _fill_by_regression(data, column, min_correlation=0.7):
    """Fill missing values of ``column`` in place using a linear model on correlated columns.

    Predictors are the other numeric columns whose Pearson correlation with
    ``column`` is at least ``min_correlation``. If there are no such columns,
    or no usable rows, the frame is left unchanged.
    """
    correlation = data.corr(method="pearson", numeric_only=True)
    if column not in correlation.columns:
        return

    # Exclude the column itself by label rather than by "correlation != 1",
    # which would also discard a perfectly correlated (ideal) predictor.
    strengths = correlation[column].drop(labels=column)
    predictors = strengths[strengths >= min_correlation].index.tolist()
    if not predictors:
        return

    target_missing = data[column].isnull()
    # LinearRegression rejects NaN inputs, so only rows with every predictor
    # present can be used for fitting or for prediction.
    predictors_known = data[predictors].notnull().all(axis=1)
    train_rows = ~target_missing & predictors_known
    fill_rows = target_missing & predictors_known
    if not train_rows.any() or not fill_rows.any():
        return

    model = LinearRegression().fit(
        data.loc[train_rows, predictors], data.loc[train_rows, column]
    )
    # Boolean masks with .loc stay correct for any index, unlike positional
    # .iloc lookups driven by index labels.
    data.loc[fill_rows, column] = model.predict(data.loc[fill_rows, predictors])


def method_clean_numbers(data, column, method):
    """Impute the missing values of one numeric column, modifying ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. Its index is reset to ``0..n-1`` before returning.
    column : label
        Column whose null entries are filled.
    method : {'mean', 'median', 'mode', 'regression'}
        ``'mean'`` / ``'median'`` / ``'mode'`` fill with that statistic of the
        observed values (the first mode when there are several).
        ``'regression'`` predicts the missing values from the other numeric
        columns whose Pearson correlation with ``column`` is >= 0.7; rows
        whose predictors are themselves missing are left as NaN.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenience.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported strategies.
    """
    supported = ('mean', 'median', 'mode', 'regression')
    if method not in supported:
        raise ValueError(
            f"Unknown imputation method {method!r}; expected one of {supported}"
        )

    missing = data[column].isnull()
    if missing.any():
        if method == 'regression':
            _fill_by_regression(data, column)
        else:
            observed = data.loc[~missing, column]
            if method == 'mean':
                fill_value = observed.mean()
            elif method == 'median':
                fill_value = observed.median()
            else:
                # mode() is empty when every value is missing; keep NaN then
                # instead of failing on the [0] lookup.
                modes = observed.mode()
                fill_value = modes.iloc[0] if not modes.empty else np.nan
            data.loc[missing, column] = fill_value

    data.reset_index(drop=True, inplace=True)

    return data
    
def method_clean_strings(data, column, method):
    """Placeholder for text-column imputation; currently performs no work.

    Every call returns ``None`` and leaves ``data`` untouched: the only
    branch, for ``method == 'mean'``, exits without doing anything.
    """
    if method != 'mean':
        return None
    return None
        
def _is_text_column(series):
    """Return True when ``series`` has object or string dtype."""
    return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)


def data_clean(df, method='listwise'):
    """Return a cleaned copy of ``df``; the input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data.
    method : str, default 'listwise'
        ``'listwise'`` drops every row that has a null value or an empty
        string in a text column. Any other value is forwarded to
        ``method_clean_numbers`` as the imputation strategy for each numeric
        column.

    Returns
    -------
    pandas.DataFrame
        A new frame. With ``'listwise'`` the index is renumbered ``0..n-1``.

    Notes
    -----
    For imputation, text columns first have empty strings normalised to NaN.
    A text column that then parses entirely as numbers is converted to
    ``float64`` and imputed like any numeric column. Genuinely textual
    columns keep their NaN markers, since no string imputation exists yet.
    """
    data = df.copy()

    if method == 'listwise':
        text_columns = [
            column for column in data.columns if _is_text_column(data[column])
        ]
        # Empty strings count as missing; testing a DataFrame directly in an
        # `if` raises, so build explicit row masks.
        has_blank = data[text_columns].eq('').any(axis=1)
        has_null = data.isnull().any(axis=1)
        return data.loc[~(has_blank | has_null)].reset_index(drop=True)

    for column in data.columns:
        if _is_text_column(data[column]):
            unusable = data[column].isnull() | (data[column] == '')
            data.loc[unusable, column] = np.nan
            try:
                data[column] = data[column].astype('float64')
            except (ValueError, TypeError):
                # Not a numeric column in disguise: leave the NaN markers.
                continue

        # Always pass the working copy so the caller's frame stays untouched
        # and earlier imputations are kept.
        data = method_clean_numbers(data, column, method)

    return data

In [104]:
# Clean the raw frame with the 'regression' strategy and display the result.
df_2 = data_clean(df, method='regression')
df_2

Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,student_id,gender,lunch,pretest,posttest
0,ANKYI,Urban,Non-public,6OL,Standard,20.0,2FHT3,Female,Does not qualify,59.465572,72.0
1,ANKYI,Urban,Non-public,6OL,Standard,20.0,3JIVH,Female,Does not qualify,65.919057,79.0
2,ANKYI,Urban,Non-public,6OL,Standard,20.0,3XOWE,Male,Does not qualify,63.153278,76.0
3,ANKYI,Urban,Non-public,6OL,Standard,20.0,556O0,Female,Does not qualify,64.075204,77.0
4,ANKYI,Urban,Non-public,6OL,Standard,20.0,74LOE,Male,Does not qualify,63.153278,76.0
...,...,...,...,...,...,...,...,...,...,...,...
2128,ZOWMK,Urban,Public,ZBH,Standard,30.0,T8LSK,Female,Does not qualify,39.000000,55.0
2129,ZOWMK,Urban,Public,ZBH,Standard,30.0,VNP26,Female,Qualifies for reduced/free lunch,38.000000,46.0
2130,ZOWMK,Urban,Public,ZBH,Standard,30.0,YDR1Z,Female,Qualifies for reduced/free lunch,45.000000,51.0
2131,ZOWMK,Urban,Public,ZBH,Standard,30.0,YUEIH,Male,Qualifies for reduced/free lunch,46.000000,53.0


In [105]:
# Pearson correlation matrix of the numeric columns of the cleaned frame.
test = df_2.corr(method="pearson", numeric_only=True)
test

Unnamed: 0,n_student,pretest,posttest
n_student,1.0,-0.499082,-0.504886
pretest,-0.499082,1.0,0.950936
posttest,-0.504886,0.950936,1.0


In [35]:
# Integer position of 'pretest' among the correlation matrix columns.
test.columns.get_loc('pretest')

1

In [95]:
# Mean pretest score in the cleaned frame; the double brackets select a
# one-column DataFrame, so the result is a one-entry Series.
df_2[["pretest"]].mean()

pretest    54.913737
dtype: float64

In [77]:
# Blank out the pretest score of rows labelled 0..10 to create missing values
# for the imputation experiments. This mutates `df` in place, so re-running
# earlier cells that display `df` will show different data.
# `np.nan` is used because the capitalised `np.NaN` alias was removed in NumPy 2.0.
df.loc[list(range(11)), 'pretest'] = np.nan

In [78]:
# Row labels of `df` whose pretest value is missing.
df[df["pretest"].isnull()].index.values

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64)

In [69]:
# Rows of the predictor columns selected positionally by `index_with_null`.
# NOTE(review): in the visible cells, `possibles_regressions`, `key` and
# `index_with_null` are only assigned inside method_clean_numbers; this cell
# presumably relied on leftover kernel state and would fail on a fresh
# kernel unless they are defined elsewhere - confirm or remove.
df[possibles_regressions[key]].iloc[index_with_null]

Unnamed: 0,posttest
0,72.0


In [22]:
# Null entries of the column named by the first element of `x_to_regression`.
# NOTE(review): `x_to_regression` is not defined in any visible cell; this
# presumably depends on leftover kernel state - confirm or remove.
df.loc[df[x_to_regression[0]].isnull(), x_to_regression[0]]

Series([], Name: pretest, dtype: float64)

In [74]:
# Compare the pretest value at label 0 in the cleaned frame and in `df`.
# NOTE(review): the recorded output shows the non-integer value on the `df`
# side, which suggests out-of-order execution or that `df` itself was
# modified during cleaning - verify on a fresh kernel.
df_2["pretest"][0], df["pretest"][0]

(62.0, 59.47076020241826)

In [38]:

# Fit a single-feature linear model predicting n_student from posttest on the
# cleaned frame. The target is passed as a 2-D array (one-column DataFrame
# `.values`), so predictions come back 2-D as well.
model = LinearRegression().fit(df_2[['posttest']], df_2[['n_student']].values)

In [48]:
# Predict n_student for the first row's posttest score; the one-element slice
# wrapped in a list forms a single sample, and [0][0] unwraps the scalar.
model.predict([df_2['posttest'][0:1]])[0][0]



22.04887251002618