In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import OneSidedSelection

In [2]:
df_full = pd.read_csv("StudentsPerformance.csv")

In [3]:
df_full

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [5]:
df_read = df_full.drop(columns=["math score", "writing score"])

In [6]:
df_read

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score
0,female,group B,bachelor's degree,standard,none,72
1,female,group C,some college,standard,completed,90
2,female,group B,master's degree,standard,none,95
3,male,group A,associate's degree,free/reduced,none,57
4,male,group C,some college,standard,none,78
...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99
996,male,group C,high school,free/reduced,none,55
997,female,group C,high school,free/reduced,completed,71
998,female,group D,some college,standard,completed,78


In [7]:
codes, uniques = pd.factorize(df_read["gender"])

In [8]:
codes

array([0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,

In [9]:
uniques

Index(['female', 'male'], dtype='object')

In [10]:
df_read["gender factorized"] = np.array(codes)

In [11]:
df_read

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,gender factorized
0,female,group B,bachelor's degree,standard,none,72,0
1,female,group C,some college,standard,completed,90,0
2,female,group B,master's degree,standard,none,95,0
3,male,group A,associate's degree,free/reduced,none,57,1
4,male,group C,some college,standard,none,78,1
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,0
996,male,group C,high school,free/reduced,none,55,1
997,female,group C,high school,free/reduced,completed,71,0
998,female,group D,some college,standard,completed,78,0


In [12]:
codes_race, uniques_race = pd.factorize(df_read["race/ethnicity"])

In [13]:
codes_race

array([0, 1, 0, 2, 1, 0, 0, 0, 3, 0, 1, 3, 0, 2, 2, 1, 1, 0, 1, 1, 3, 0,
       3, 1, 3, 2, 0, 1, 1, 3, 3, 0, 4, 3, 4, 4, 3, 3, 3, 0, 1, 1, 0, 0,
       4, 0, 2, 1, 3, 1, 4, 4, 1, 3, 1, 1, 4, 3, 3, 1, 4, 2, 2, 1, 3, 0,
       3, 1, 0, 1, 3, 3, 2, 1, 1, 0, 4, 2, 3, 4, 0, 0, 2, 4, 3, 1, 1, 3,
       2, 3, 1, 1, 1, 1, 0, 1, 0, 4, 3, 3, 0, 3, 3, 0, 1, 1, 3, 4, 0, 0,
       3, 1, 2, 3, 4, 1, 0, 3, 3, 1, 1, 0, 1, 3, 4, 0, 0, 3, 3, 2, 3, 1,
       4, 1, 3, 1, 0, 4, 1, 3, 3, 1, 4, 2, 3, 1, 0, 1, 3, 4, 2, 2, 0, 3,
       3, 1, 4, 0, 0, 3, 0, 4, 0, 1, 4, 1, 1, 0, 0, 1, 2, 4, 3, 1, 1, 1,
       0, 1, 0, 3, 1, 1, 4, 3, 1, 1, 4, 3, 0, 1, 4, 3, 0, 3, 1, 3, 1, 4,
       0, 0, 1, 3, 1, 0, 1, 3, 4, 4, 0, 0, 3, 1, 1, 1, 4, 0, 4, 1, 0, 0,
       3, 0, 1, 3, 0, 4, 1, 3, 2, 1, 3, 1, 0, 4, 1, 3, 3, 3, 0, 1, 3, 4,
       3, 4, 3, 1, 4, 0, 0, 1, 2, 3, 0, 3, 3, 4, 1, 1, 0, 1, 1, 1, 1, 4,
       3, 3, 1, 3, 3, 4, 1, 1, 3, 3, 0, 1, 1, 4, 1, 0, 3, 3, 3, 3, 0, 0,
       4, 0, 0, 4, 1, 3, 1, 4, 3, 0, 2, 4, 1, 3, 2,

In [14]:
uniques_race

Index(['group B', 'group C', 'group A', 'group D', 'group E'], dtype='object')

In [15]:
df_read["race/ethnicity fact"] = np.array(codes_race)

In [16]:
df_read

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,gender factorized,race/ethnicity fact
0,female,group B,bachelor's degree,standard,none,72,0,0
1,female,group C,some college,standard,completed,90,0,1
2,female,group B,master's degree,standard,none,95,0,0
3,male,group A,associate's degree,free/reduced,none,57,1,2
4,male,group C,some college,standard,none,78,1,1
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,0,4
996,male,group C,high school,free/reduced,none,55,1,1
997,female,group C,high school,free/reduced,completed,71,0,1
998,female,group D,some college,standard,completed,78,0,3


In [17]:
codes_parental, uniques_parental = pd.factorize(df_read["parental level of education"])
uniques_parental

Index(['bachelor's degree', 'some college', 'master's degree',
       'associate's degree', 'high school', 'some high school'],
      dtype='object')

In [18]:
df_read["parental level of education fact"] = np.array(codes_parental)
df_read

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,gender factorized,race/ethnicity fact,parental level of education fact
0,female,group B,bachelor's degree,standard,none,72,0,0,0
1,female,group C,some college,standard,completed,90,0,1,1
2,female,group B,master's degree,standard,none,95,0,0,2
3,male,group A,associate's degree,free/reduced,none,57,1,2,3
4,male,group C,some college,standard,none,78,1,1,1
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,0,4,2
996,male,group C,high school,free/reduced,none,55,1,1,4
997,female,group C,high school,free/reduced,completed,71,0,1,4
998,female,group D,some college,standard,completed,78,0,3,1


In [19]:
codes_lunch, unique_lunch = pd.factorize(df_read['lunch'])
df_read['lunch fact'] = np.array(codes_lunch)
df_read

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,gender factorized,race/ethnicity fact,parental level of education fact,lunch fact
0,female,group B,bachelor's degree,standard,none,72,0,0,0,0
1,female,group C,some college,standard,completed,90,0,1,1,0
2,female,group B,master's degree,standard,none,95,0,0,2,0
3,male,group A,associate's degree,free/reduced,none,57,1,2,3,1
4,male,group C,some college,standard,none,78,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,0,4,2,0
996,male,group C,high school,free/reduced,none,55,1,1,4,1
997,female,group C,high school,free/reduced,completed,71,0,1,4,1
998,female,group D,some college,standard,completed,78,0,3,1,0


In [20]:
unique_lunch

Index(['standard', 'free/reduced'], dtype='object')

In [21]:
codes_prep, uniques_prep = pd.factorize(df_read['test preparation course'])
df_read['prep fact'] = np.array(codes_prep)

In [22]:
df_read

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,gender factorized,race/ethnicity fact,parental level of education fact,lunch fact,prep fact
0,female,group B,bachelor's degree,standard,none,72,0,0,0,0,0
1,female,group C,some college,standard,completed,90,0,1,1,0,1
2,female,group B,master's degree,standard,none,95,0,0,2,0,0
3,male,group A,associate's degree,free/reduced,none,57,1,2,3,1,0
4,male,group C,some college,standard,none,78,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,0,4,2,0,1
996,male,group C,high school,free/reduced,none,55,1,1,4,1,0
997,female,group C,high school,free/reduced,completed,71,0,1,4,1,1
998,female,group D,some college,standard,completed,78,0,3,1,0,1


In [24]:
uniques_prep

Index(['none', 'completed'], dtype='object')

In [23]:
#Linear regression
X = pd.DataFrame()

X["gender"] = df_read['gender factorized']
X["race/ethnicity"] = df_read['race/ethnicity fact']
# X['parental'] = df_math["parental level of education fact"]
X['lunch'] = df_read["lunch fact"]
X['preparation'] = df_read["prep fact"]

y = pd.DataFrame()
y['math'] = df_read['reading score']

X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = LinearRegression()

model.fit(X_train, y_train)

y_predict = model.predict(x_test)
print(r2_score(y_test, y_predict))
print(mean_absolute_error(y_test, y_predict))
print(mean_squared_error(y_test, y_predict))



0.14535037252973027
10.937081107245987
187.1426075609243


In [24]:
#MLP Regressor
X = pd.DataFrame()

X["gender"] = df_read['gender factorized']
# X["race/ethnicity"] = df_math['race/ethnicity fact']
X['parental'] = df_read["parental level of education fact"]
X['lunch'] = df_read["lunch fact"]
X['preparation'] = df_read["prep fact"]

y = pd.DataFrame()
y['math'] = df_read['reading score']

X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = MLPRegressor(random_state=0)

model.fit(X_train, y_train)

y_predict = model.predict(x_test)
print(r2_score(y_test, y_predict))
print(mean_absolute_error(y_test, y_predict))
print(mean_squared_error(y_test, y_predict))


  y = column_or_1d(y, warn=True)


-1.6073613668030093
19.77237295702179
570.9338533048208




In [25]:
# Decision Tree Regressor
X = pd.DataFrame()

X["gender"] = df_read['gender factorized']
# X["race/ethnicity"] = df_math['race/ethnicity fact']
X['parental'] = df_read["parental level of education fact"]
X['lunch'] = df_read["lunch fact"]
X['preparation'] = df_read["prep fact"]

y = pd.DataFrame()
y['math'] = df_read['reading score']

X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = DecisionTreeRegressor(random_state=0)

model.fit(X_train, y_train)

y_predict = model.predict(x_test)
print(r2_score(y_test, y_predict))
print(mean_absolute_error(y_test, y_predict))
print(mean_squared_error(y_test, y_predict))


0.08302227761090342
11.23211453944755
200.79058894709743


In [26]:
# Random Forest Regressor
X = pd.DataFrame()

X["gender"] = df_read['gender factorized']
# X["race/ethnicity"] = df_math['race/ethnicity fact']
# X['parental'] = df_math["parental level of education fact"]
X['lunch'] = df_read["lunch fact"]
X['preparation'] = df_read["prep fact"]

y = pd.DataFrame()
y['math'] = df_read['reading score']

X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = RandomForestRegressor(random_state=0)

model.fit(X_train, y_train)

y_predict = model.predict(x_test)
print(r2_score(y_test, y_predict))
print(mean_absolute_error(y_test, y_predict))
print(mean_squared_error(y_test, y_predict))

0.11674004274354044
11.071914453189178
193.40741075894803


  model.fit(X_train, y_train)


In [44]:
# # SVM
# X = pd.DataFrame()

# X["gender"] = df_math['gender factorized']
# X["race/ethnicity"] = df_math['race/ethnicity fact']
# X['parental'] = df_math["parental level of education fact"]
# X['lunch'] = df_math["lunch fact"]
# X['preparation'] = df_math["prep fact"]

# y = pd.DataFrame()
# y['math'] = df_math['math score']

# X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# model = SVC(random_state=0)

# model.fit(X_train, y_train)

# y_predict = model.predict(x_test)
# print(r2_score(y_test, y_predict))
# print(mean_absolute_error(y_test, y_predict))
# print(mean_squared_error(y_test, y_predict))

0.03566435541496615
11.65
218.52


  y = column_or_1d(y, warn=True)


In [27]:
df_read

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,gender factorized,race/ethnicity fact,parental level of education fact,lunch fact,prep fact
0,female,group B,bachelor's degree,standard,none,72,0,0,0,0,0
1,female,group C,some college,standard,completed,90,0,1,1,0,1
2,female,group B,master's degree,standard,none,95,0,0,2,0,0
3,male,group A,associate's degree,free/reduced,none,57,1,2,3,1,0
4,male,group C,some college,standard,none,78,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,0,4,2,0,1
996,male,group C,high school,free/reduced,none,55,1,1,4,1,0
997,female,group C,high school,free/reduced,completed,71,0,1,4,1,1
998,female,group D,some college,standard,completed,78,0,3,1,0,1


In [29]:
df_read.loc[df_read["reading score"] < 60, "pass/fail"] = 0

In [30]:
df_read

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,gender factorized,race/ethnicity fact,parental level of education fact,lunch fact,prep fact,pass/fail
0,female,group B,bachelor's degree,standard,none,72,0,0,0,0,0,
1,female,group C,some college,standard,completed,90,0,1,1,0,1,
2,female,group B,master's degree,standard,none,95,0,0,2,0,0,
3,male,group A,associate's degree,free/reduced,none,57,1,2,3,1,0,0.0
4,male,group C,some college,standard,none,78,1,1,1,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,0,4,2,0,1,
996,male,group C,high school,free/reduced,none,55,1,1,4,1,0,0.0
997,female,group C,high school,free/reduced,completed,71,0,1,4,1,1,
998,female,group D,some college,standard,completed,78,0,3,1,0,1,


In [31]:
df_read.loc[df_read["reading score"] >= 60, "pass/fail"] = 1

In [32]:
df_read

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,gender factorized,race/ethnicity fact,parental level of education fact,lunch fact,prep fact,pass/fail
0,female,group B,bachelor's degree,standard,none,72,0,0,0,0,0,1.0
1,female,group C,some college,standard,completed,90,0,1,1,0,1,1.0
2,female,group B,master's degree,standard,none,95,0,0,2,0,0,1.0
3,male,group A,associate's degree,free/reduced,none,57,1,2,3,1,0,0.0
4,male,group C,some college,standard,none,78,1,1,1,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,0,4,2,0,1,1.0
996,male,group C,high school,free/reduced,none,55,1,1,4,1,0,0.0
997,female,group C,high school,free/reduced,completed,71,0,1,4,1,1,1.0
998,female,group D,some college,standard,completed,78,0,3,1,0,1,1.0


In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
#Logistic regression
X = pd.DataFrame()

X["gender"] = df_read['gender factorized']
# X["race/ethnicity"] = df_math['race/ethnicity fact']
# X['parental'] = df_math["parental level of education fact"]
X['lunch'] = df_read["lunch fact"]
X['preparation'] = df_read["prep fact"]

y = pd.DataFrame()
y['math'] = df_read['pass/fail']

# smt = SMn, OTE()
# X_train_sm, y_train_sm = smt.fit_resample(X, y)
# X_traix_test, y_train, y_test = train_test_split(X_train_sm, y_train_sm, test_size=0.2, random_state=0)

oss = OneSidedSelection()
X_train_oss, y_train_oss = oss.fit_resample(X, y)
X_train, x_test, y_train, y_test = train_test_split(X_train_oss, y_train_oss, test_size=0.2, random_state=0)

# X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = LogisticRegression()

model.fit(X_train, y_train)

y_predict = model.predict(x_test)
print(classification_report(y_test, y_predict))


              precision    recall  f1-score   support

         0.0       0.33      0.13      0.19        39
         1.0       0.82      0.94      0.87       161

    accuracy                           0.78       200
   macro avg       0.57      0.53      0.53       200
weighted avg       0.72      0.78      0.74       200



  y = column_or_1d(y, warn=True)


In [52]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
#Naive Bayes regression
X = pd.DataFrame()

X["gender"] = df_read['gender factorized']
# X["race/ethnicity"] = df_math['race/ethnicity fact']
# X['parental'] = df_math["parental level of education fact"]
X['lunch'] = df_read["lunch fact"]
X['preparation'] = df_read["prep fact"]

y = pd.DataFrame()
y['math'] = df_read['pass/fail']

# smt = SMOTE()
# X_train_sm, y_train_sm = smt.fit_resample(X, y)
# X_train, x_test, y_train, y_test = train_test_split(X_train_sm, y_train_sm, test_size=0.2, random_state=0)

oss = OneSidedSelection()
X_train_oss, y_train_oss = oss.fit_resample(X, y)
X_train, x_test, y_train, y_test = train_test_split(X_train_oss, y_train_oss, test_size=0.2, random_state=0)

# X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = GaussianNB()

model.fit(X_train, y_train)

y_predict = model.predict(x_test)
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

         0.0       0.36      0.13      0.19        39
         1.0       0.82      0.94      0.88       161

    accuracy                           0.79       200
   macro avg       0.59      0.54      0.53       200
weighted avg       0.73      0.79      0.74       200



  y = column_or_1d(y, warn=True)


In [50]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
# KNN
X = pd.DataFrame()

X["gender"] = df_read['gender factorized']
# X["race/ethnicity"] = df_math['race/ethnicity fact']
# X['parental'] = df_math["parental level of education fact"]
X['lunch'] = df_read["lunch fact"]
X['preparation'] = df_read["prep fact"]

y = pd.DataFrame()
y['math'] = df_read['pass/fail']

# smt = SMOTE()
# X_train_sm, y_train_sm = smt.fit_resample(X, y)
# X_train, x_test, y_train, y_test = train_test_split(X_train_sm, y_train_sm, test_size=0.2, random_state=0)

oss = OneSidedSelection()
X_train_oss, y_train_oss = oss.fit_resample(X, y)
X_train, x_test, y_train, y_test = train_test_split(X_train_oss, y_train_oss, test_size=0.2, random_state=0)

# X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = KNeighborsClassifier()

model.fit(X_train, y_train)

y_predict = model.predict(x_test)
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

         0.0       0.31      0.28      0.29        39
         1.0       0.83      0.84      0.84       161

    accuracy                           0.73       200
   macro avg       0.57      0.56      0.57       200
weighted avg       0.73      0.73      0.73       200



  return self._fit(X, y)


In [47]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
# Decision Tree Classifier
X = pd.DataFrame()

X["gender"] = df_read['gender factorized']
# X["race/ethnicity"] = df_math['race/ethnicity fact']
# X['parental'] = df_math["parental level of education fact"]
X['lunch'] = df_read["lunch fact"]
X['preparation'] = df_read["prep fact"]

y = pd.DataFrame()
y['math'] = df_read['pass/fail']

# smt = SMOTE()
# X_train_sm, y_train_sm = smt.fit_resample(X, y)
# X_train, x_test, y_train, y_test = train_test_split(X_train_sm, y_train_sm, test_size=0.2, random_state=0)

oss = OneSidedSelection()
X_train_oss, y_train_oss = oss.fit_resample(X, y)
X_train, x_test, y_train, y_test = train_test_split(X_train_oss, y_train_oss, test_size=0.2, random_state=0)

# X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = DecisionTreeClassifier()

model.fit(X_train, y_train)

y_predict = model.predict(x_test)
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

         0.0       0.42      0.13      0.20        39
         1.0       0.82      0.96      0.88       161

    accuracy                           0.80       200
   macro avg       0.62      0.54      0.54       200
weighted avg       0.74      0.80      0.75       200



In [44]:
# SVC
X = pd.DataFrame()

X["gender"] = df_read['gender factorized']
# X["race/ethnicity"] = df_math['race/ethnicity fact']
# X['parental'] = df_math["parental level of education fact"]
X['lunch'] = df_read["lunch fact"]
X['preparation'] = df_read["prep fact"]

y = pd.DataFrame()
y['math'] = df_read['pass/fail']


# smt = SMOTE()
# X_train_sm, y_train_sm = smt.fit_resample(X, y)
# X_train, x_test, y_train, y_test = train_test_split(X_train_sm, y_train_sm, test_size=0.2, random_state=0)

oss = OneSidedSelection()
X_train_oss, y_train_oss = oss.fit_resample(X, y)
X_train, x_test, y_train, y_test = train_test_split(X_train_oss, y_train_oss, test_size=0.2, random_state=0)

# X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = SVC()

model.fit(X_train, y_train)

y_predict = model.predict(x_test)
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

         0.0       0.36      0.13      0.19        39
         1.0       0.82      0.94      0.88       161

    accuracy                           0.79       200
   macro avg       0.59      0.54      0.53       200
weighted avg       0.73      0.79      0.74       200



  y = column_or_1d(y, warn=True)
