In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer

In [2]:
score_hg = pd.read_csv("diem_thi_ha_giang.csv")
score_hg.head()

Unnamed: 0,TOAN,NGU VAN,TIENG ANH,LICH SU,DIA LI,GDCD,NAM
0,3.6,5.25,,,4.75,,2017
1,2.6,5.25,,,6.0,,2017
2,3.8,6.0,,,7.25,,2017
3,4.8,5.25,,,4.0,,2017
4,3.6,3.25,,,5.5,,2017


In [3]:
print("Shape of data:", score_hg.shape)
score_hg.describe()

Shape of data: (5563, 7)


Unnamed: 0,TOAN,NGU VAN,TIENG ANH,LICH SU,DIA LI,GDCD,NAM
count,5502.0,5463.0,758.0,0.0,4088.0,3408.0,5563.0
mean,3.377935,4.593813,3.305541,,5.404048,7.067415,2017.0
std,1.462806,1.601184,1.262463,,1.583839,1.319649,0.0
min,0.6,0.0,0.2,,0.5,2.0,2017.0
25%,2.4,3.5,2.4,,4.25,6.25,2017.0
50%,3.0,4.5,3.0,,5.25,7.25,2017.0
75%,4.0,5.75,3.8,,6.5,8.0,2017.0
max,9.6,8.75,9.8,,9.75,9.75,2017.0


In [4]:
# Missing at Random (MAR): propensity for a data point to be missing is not related to the missing data,but it is related to some of the observed data
# Missing Completely at Random (MCAR): certain value is missing has nothing to do with its hypothesis value and with the values of other variables
# Missing not at Random (MNAR): missing value depends on the hypothetical value

In [5]:
# MAR and MCAR: it's safe to delete the data
# Deletion
score_hg_drop_math = score_hg[pd.isnull(score_hg['TOAN']) == False]
print(score_hg_drop_math.shape)
# Dropping variable
score_hg_drop_hist = score_hg.drop('LICH SU', axis=1)
score_hg_drop_hist.head()

(5502, 7)


Unnamed: 0,TOAN,NGU VAN,TIENG ANH,DIA LI,GDCD,NAM
0,3.6,5.25,,4.75,,2017
1,2.6,5.25,,6.0,,2017
2,3.8,6.0,,7.25,,2017
3,4.8,5.25,,4.0,,2017
4,3.6,3.25,,5.5,,2017


In [6]:
# MNAR: it's not allowed to delete the missing data
# Use Time Series
# Linear regression
# Multiple imputation
# K near neighbors
# Mean, Median and Mode
score_hg_math_mean = score_hg
score_math = score_hg_math_mean.TOAN
imputer = Imputer(missing_values=float('NaN'), strategy='mean')
score_math = score_math.values.reshape(-1, 1)
transformed_score_math = imputer.fit_transform(score_math)
score_hg_math_mean.TOAN = pd.Series(transformed_score_math.tolist())
score_hg_math_mean.TOAN = score_hg_math_mean.TOAN.apply(lambda x: list(x)[0])

In [7]:
score_hg_math_mean.describe()

Unnamed: 0,TOAN,NGU VAN,TIENG ANH,LICH SU,DIA LI,GDCD,NAM
count,5563.0,5463.0,758.0,0.0,4088.0,3408.0,5563.0
mean,3.377935,4.593813,3.305541,,5.404048,7.067415,2017.0
std,1.454762,1.601184,1.262463,,1.583839,1.319649,0.0
min,0.6,0.0,0.2,,0.5,2.0,2017.0
25%,2.4,3.5,2.4,,4.25,6.25,2017.0
50%,3.0,4.5,3.0,,5.25,7.25,2017.0
75%,3.8,5.75,3.8,,6.5,8.0,2017.0
max,9.6,8.75,9.8,,9.75,9.75,2017.0


In [10]:
score_hg_math_median = score_hg
score_math = score_hg_math_median.TOAN
imputer = Imputer(missing_values=float('NaN'), strategy='median')
score_math = score_math.values.reshape(-1, 1)
transformed_score_math = imputer.fit_transform(score_math)
score_hg_math_median.TOAN = pd.Series(transformed_score_math.tolist())
score_hg_math_median.TOAN = score_hg_math_median.TOAN.apply(lambda x: list(x)[0])

In [11]:
score_hg_math_median.describe()

Unnamed: 0,TOAN,NGU VAN,TIENG ANH,LICH SU,DIA LI,GDCD,NAM
count,5563.0,5463.0,758.0,0.0,4088.0,3408.0,5563.0
mean,3.377935,4.593813,3.305541,,5.404048,7.067415,2017.0
std,1.454762,1.601184,1.262463,,1.583839,1.319649,0.0
min,0.6,0.0,0.2,,0.5,2.0,2017.0
25%,2.4,3.5,2.4,,4.25,6.25,2017.0
50%,3.0,4.5,3.0,,5.25,7.25,2017.0
75%,3.8,5.75,3.8,,6.5,8.0,2017.0
max,9.6,8.75,9.8,,9.75,9.75,2017.0


In [13]:
score_hg_math_mode = score_hg
score_math = score_hg_math_mode.TOAN
imputer = Imputer(missing_values=float('NaN'), strategy='most_frequent')
score_math = score_math.values.reshape(-1, 1)
transformed_score_math = imputer.fit_transform(score_math)
score_hg_math_mode.TOAN = pd.Series(transformed_score_math.tolist())
score_hg_math_mode.TOAN = score_hg_math_mode.TOAN.apply(lambda x: list(x)[0])

In [14]:
score_hg_math_mode.describe()

Unnamed: 0,TOAN,NGU VAN,TIENG ANH,LICH SU,DIA LI,GDCD,NAM
count,5563.0,5463.0,758.0,0.0,4088.0,3408.0,5563.0
mean,3.377935,4.593813,3.305541,,5.404048,7.067415,2017.0
std,1.454762,1.601184,1.262463,,1.583839,1.319649,0.0
min,0.6,0.0,0.2,,0.5,2.0,2017.0
25%,2.4,3.5,2.4,,4.25,6.25,2017.0
50%,3.0,4.5,3.0,,5.25,7.25,2017.0
75%,3.8,5.75,3.8,,6.5,8.0,2017.0
max,9.6,8.75,9.8,,9.75,9.75,2017.0
