In [8]:
import pandas as pd

In [9]:
df = pd.read_csv("E:/dataML-DL/StudentScore.xls")
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [10]:
df.shape

(1000, 8)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [12]:
df.drop_duplicates()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [13]:
df.isnull().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [14]:
# Report data
from ydata_profiling import ProfileReport
prf = ProfileReport(df,title= "Student score",explorative=True)
prf.to_file("report.html")

  from .autonotebook import tqdm as notebook_tqdm
Summarize dataset: 100%|██████████| 26/26 [00:01<00:00, 18.33it/s, Completed]                                   
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.09s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.19it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 143.56it/s]


In [15]:
# chia data to preprocessing
from sklearn.model_selection import train_test_split

target = "writing score"
x = df.drop(target,axis=1)
y = df[target]

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size= 0.2, random_state= 42)
x_test 


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score
521,female,group C,associate's degree,standard,none,91,86
737,female,group B,some college,free/reduced,completed,53,66
740,male,group D,bachelor's degree,standard,none,80,73
660,male,group C,some college,free/reduced,none,74,77
411,male,group E,some college,standard,completed,84,83
...,...,...,...,...,...,...,...
408,female,group D,high school,free/reduced,completed,52,57
332,male,group E,associate's degree,standard,completed,62,56
208,female,group B,some college,free/reduced,none,74,81
613,female,group C,associate's degree,standard,none,65,77


# Preprocessing


In [16]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer


In [17]:
# math,read score
nums_transform = Pipeline(steps= [
    ("imputers", SimpleImputer(missing_values=-1, strategy="median")),
    ("scaler", StandardScaler())
])


In [18]:
# Ordinal feature
gender_values = ["male","female"]
lunch_values = x_train["lunch"].unique()
test_values = x_train["test preparation course"].unique()
education = ["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school']

ord_transform = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder",OrdinalEncoder(categories=[education,gender_values,lunch_values,test_values]))
])

In [19]:
# onehot encoder to nomerical

nom_transform = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot",OneHotEncoder())
])

In [20]:
procession = ColumnTransformer(transformers=[
    ("nums_transform",nums_transform,["math score","reading score"]),
    ("ord_transform",ord_transform,["parental level of education","gender","lunch","test preparation course"]),
    ("one_transform",nom_transform,["race/ethnicity"])
])

In [21]:
# Train model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score,mean_squared_error
reg = Pipeline(steps= [
    ("procession",procession),
    ("model", LinearRegression()),
])

#reg .fit(x_train, y_train)
#y_predict = reg.predict(x_test)

#print("MAE {}".format(mean_absolute_error(y_test, y_predict)))
#print("MSE {}".format(mean_squared_error(y_test, y_predict)))
#print("R2 {}".format(r2_score(y_test, y_predict)))

In [22]:
# Gridsearch
from sklearn.model_selection import GridSearchCV

params = {
    "model__n_estimators" :[100,200,300],
    "model__criterion":["poison","absolute_error"]
}
grid_search = GridSearchCV(estimator=reg,param_grid=params, scoring= "recall", verbose= 2)
grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


ValueError: Invalid parameter 'criterion' for estimator LinearRegression(). Valid parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive'].