In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
import xgboost as xgs

## Regression Problem

In [144]:
xgs.XGBRegressor()

In [2]:
test_data = pd.read_csv("/Users/Bnkes/Desktop/GitHub/Data-Science-and-Machine-Learning/Data/test_new.csv")
train_data = pd.read_csv("/Users/Bnkes/Desktop/GitHub/Data-Science-and-Machine-Learning/Data/train_new.csv")
train_data = train_data.dropna()

In [3]:
X = train_data.drop(["PID", "SalePrice"], axis = 1)
y = np.log(train_data["SalePrice"])


In [19]:
X.columns

Index(['Lot Frontage', 'Lot Area', 'Street', 'Neighborhood', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Year Built',
       'Roof Style', 'Heating', 'Central Air', 'Electrical', 'Full Bath',
       'Half Bath', 'Bedroom AbvGr', 'TotRms AbvGrd', 'Gr Liv Area',
       'Functional', 'Screen Porch', 'Pool Area', 'Yr Sold', 'Sale Type'],
      dtype='object')

In [29]:
pd.unique(X["Bldg Type"])

array(['1Fam', 'TwnhsE', 'Twnhs', '2fmCon', 'Duplex'], dtype=object)

In [21]:
pd.unique(X['Bldg Type'])

array(['1Fam', 'TwnhsE', 'Twnhs', '2fmCon', 'Duplex'], dtype=object)

In [4]:
X["Bldg Type"] = X["Overall Cond"].astype(object)
X["Overall Cond"] = X["Overall Cond"].astype(int)
X["Year Built"] = X["Year Built"].astype(int)
X["Roof Style"] = X["Roof Style"].astype(object)

In [51]:
X[["Street", "Neighborhood", "Bldg Type", "House Style", "Roof Style", "Heating", "Central Air", "Electrical", "Functional", "Screen Porch", "Sale Type"]]

Unnamed: 0,Street,Neighborhood,Bldg Type,House Style,Roof Style,Heating,Central Air,Electrical,Functional,Screen Porch,Sale Type
0,Pave,SawyerW,6,1Story,Gable,GasA,Y,SBrkr,Typ,0,WD
1,Pave,SawyerW,7,1Story,Hip,GasA,Y,SBrkr,Typ,0,WD
3,Pave,NridgHt,5,1Story,Hip,GasA,Y,SBrkr,Typ,0,WD
4,Pave,Gilbert,5,SLvl,Gable,GasA,Y,SBrkr,Typ,0,WD
5,Pave,NAmes,6,1Story,Hip,GasA,Y,SBrkr,Typ,225,WD
...,...,...,...,...,...,...,...,...,...,...,...
2192,Pave,SawyerW,5,2Story,Gable,GasA,Y,SBrkr,Typ,0,WD
2193,Pave,SWISU,4,1.5Fin,Gable,GasA,Y,SBrkr,Typ,0,WD
2194,Pave,Gilbert,5,2Story,Gable,GasA,Y,SBrkr,Typ,0,WD
2195,Pave,Edwards,5,1Story,Gable,GasA,N,SBrkr,Typ,0,COD


In [57]:
pd.unique(X["Electrical"])

array(['SBrkr', 'FuseA', 'FuseP', 'FuseF', 'Mix'], dtype=object)

In [56]:
 OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first").fit_transform(X[["Electrical"]])

array([[0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [None]:
 ["Street", "Neighborhood", "House Style", "Roof Style", "Heating", "Central Air", "Electrical", "Functional", "Sale Type"]
    
    ,
    

In [10]:
ct = ColumnTransformer([
    ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
    ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
])

my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("XGB", xgs.XGBRegressor(n_jobs = -1, learning_rate = .1, n_estimators = 100, max_depth = 4, min_child_weight = 0, colsample_bytree = .7, subsample = .6, gamma = .001))
])

degrees = {
    "XGB__gamma": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0, 1e0, 1e1, 1e2, 1e3]
}


gscv = GridSearchCV(my_pipeline, degrees, cv = 5, scoring='neg_mean_squared_error')
gscv_fitted = gscv.fit(X, y)
gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_



In [11]:
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("XGB", xgs.XGBRegressor(n_jobs = -1, learning_rate = .1, n_estimators = 100, max_depth = 4, min_child_weight = 0, colsample_bytree = .7, subsample = .6, gamma = .001))
])

fitted_pipeline = my_pipeline.fit(X, y)

In [16]:
final_predictions = pd.DataFrame(
    {"PID": test_data["PID"],
    "SalePrice": np.exp(fitted_pipeline.predict(test_data))}
)

final_predictions.to_csv("Regression Prediction 1.csv", index=False)
final_predictions



Unnamed: 0,PID,SalePrice
0,907135180,128989.539062
1,528181040,199846.921875
2,528175010,206338.875000
3,531379030,193365.687500
4,923275090,124732.625000
...,...,...
600,528174060,188236.125000
601,903400180,188622.281250
602,903227150,132493.218750
603,909250070,151644.562500


In [9]:
gscv_fitted.best_params_

{'XGB__colsample_bytree': 0.7000000000000001,
 'XGB__subsample': 0.6000000000000001}

In [54]:
gscv_fitted.best_score_

-0.019997209039940508

## Classification Problem

In [32]:
test_data = pd.read_csv("/Users/ben/Documents/GitHub/DSML/Data/CAH-201803-test.csv")
train_data = pd.read_csv("/Users/ben/Documents/GitHub/DSML/Data/CAH-201803-train.csv")
test_data.columns = ["id", "gender", "age", "political axis", "education", "race", "prostitution illegal", "weed illegal", "organ sales illegal", "religious", "abortion opinion", "hookup disapproval", "weed causes hookup", "abortion banned hookup results", "woman body rights", "abortion morally wrong", "sex without love acceptable", "elected official sexual misconduct acceptable"]
train_data.columns = ["id", "gender", "age", "political party", "political axis", "education", "race", "prostitution illegal", "weed illegal", "organ sales illegal", "religious", "abortion opinion", "hookup disapproval", "weed causes hookup", "abortion banned hookup results", "woman body rights", "abortion morally wrong", "sex without love acceptable", "elected official sexual misconduct acceptable"]

In [35]:
train_data.head()

Unnamed: 0,id,gender,age,political party,political axis,education,race,prostitution illegal,weed illegal,organ sales illegal,religious,abortion opinion,hookup disapproval,weed causes hookup,abortion banned hookup results,woman body rights,abortion morally wrong,sex without love acceptable,elected official sexual misconduct acceptable
0,1,Male,53,Independent,Liberal,College degree,Black,No,No,No,"Yes, somewhat religious",Pro-Choice,No,No,Behave no differently,5,2,5,No
1,5,Female,66,Independent,Conservative,Some college,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,Yes,Less Willing,4,5,4,No
2,7,Female,58,Democrat,Liberal,College degree,White,No,No,No,"Yes, very religious",Pro-Choice,No,No,Behave no differently,5,1,4,Yes
3,8,Male,55,Independent,Moderate,High school or less,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,Yes,Yes,Less Willing,4,5,4,Yes
4,9,Male,64,Republican,Conservative,High school or less,White,Yes,Yes,Yes,No,Pro-life,No,No,Behave no differently,5,1,1,Yes


In [37]:
X_train = train_data.drop(["political party", "id"], axis = 1)
X_test = test_data.drop(["id"], axis = 1)
y_train = train_data["political party"]
y_train = LabelEncoder().fit_transform(y_train)

In [44]:
ct = ColumnTransformer([
    ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
    ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
])

my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("SVM", SVC(kernel="linear"))
])

degrees = {
    "SVM__C": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4],
    "SVM__degree": list(range(0, 10))
}

gscv = GridSearchCV(my_pipeline, degrees, cv = 10, scoring='accuracy')
gscv_fitted = gscv.fit(X, y)
gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

In [45]:
gscv_fitted.cv_results_["mean_test_score"]

array([0.34926471, 0.34926471, 0.34926471, 0.34926471, 0.34926471,
       0.34926471, 0.34926471, 0.34926471, 0.34926471, 0.34926471,
       0.34926471, 0.34926471, 0.34926471, 0.34926471, 0.34926471,
       0.34926471, 0.34926471, 0.34926471, 0.34926471, 0.34926471,
       0.34926471, 0.34926471, 0.34926471, 0.34926471, 0.34926471,
       0.34926471, 0.34926471, 0.34926471, 0.34926471, 0.34926471,
       0.52132353, 0.52132353, 0.52132353, 0.52132353, 0.52132353,
       0.52132353, 0.52132353, 0.52132353, 0.52132353, 0.52132353,
       0.60367647, 0.60367647, 0.60367647, 0.60367647, 0.60367647,
       0.60367647, 0.60367647, 0.60367647, 0.60367647, 0.60367647,
       0.59191176, 0.59191176, 0.59191176, 0.59191176, 0.59191176,
       0.59191176, 0.59191176, 0.59191176, 0.59191176, 0.59191176,
       0.57463235, 0.57463235, 0.57463235, 0.57463235, 0.57463235,
       0.57463235, 0.57463235, 0.57463235, 0.57463235, 0.57463235,
       0.55698529, 0.55698529, 0.55698529, 0.55698529, 0.55698

In [131]:
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("SVM", SVC(kernel="linear", C=.1, degree = 0))
])

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X_train)

In [47]:
accuracy_score(y_true = y_train, y_pred = y_pred)

0.6745562130177515

In [132]:
preds = fitted_pipeline.predict(test_data)
replacement_dict = {0: "Democrat", 1: "Independent", 2: "Republican"}

vectorized_replace = np.vectorize(lambda x: replacement_dict.get(x, x))

named_preds = vectorized_replace(preds)


In [133]:
final_predictions = pd.DataFrame(
    {"id_num": test_data['id'],
    "political_affiliation_predicted": named_preds}
)

final_predictions.to_csv("Classification Prediction 1.csv", index=False)

In [76]:
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("LDA", LinearDiscriminantAnalysis())
])

degrees = {
    "LDA__solver": ["svd", "lsqr", "eigen"]
}

gscv = GridSearchCV(my_pipeline, degrees, cv = 10, scoring='accuracy')
gscv_fitted = gscv.fit(X, y)
gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

In [77]:
gscv_fitted.cv_results_["mean_test_score"]

array([0.58602941, 0.58602941, 0.58602941])

In [82]:
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("LDA", LinearDiscriminantAnalysis(solver="svd"))
])

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X_train)

accuracy_score(y_true = y_train, y_pred = y_pred)

In [79]:
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("QDA", QuadraticDiscriminantAnalysis())
])

degrees = {
    "QDA__reg_param": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0]
}

gscv = GridSearchCV(my_pipeline, degrees, cv = 10, scoring='accuracy')
gscv_fitted = gscv.fit(X, y)
gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_



In [80]:
gscv_fitted.cv_results_["mean_test_score"]

array([0.47316176, 0.49154412, 0.47352941, 0.48492647, 0.54485294,
       0.53897059])

In [141]:
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("QDA", QuadraticDiscriminantAnalysis(reg_param=.1))
])

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X_train)

accuracy_score(y_true = y_train, y_pred = y_pred)

confusion_matrix(y_true=y_train, y_pred = y_pred)



array([[53,  4,  2],
       [ 5, 49,  2],
       [ 2,  2, 50]])

In [142]:
preds = fitted_pipeline.predict(X_test)
named_preds = vectorized_replace(preds)

final_predictions = pd.DataFrame(
    {"id_num": test_data['id'],
    "political_affiliation_predicted": named_preds}
)

final_predictions.to_csv("Classification Prediction 3.csv", index=False)

In [101]:
scores = []
matrices = []
for i in range(10):
    X_qda_train, X_qda_test, y_qda_train, y_qda_test = train_test_split(X_train, y_train, random_state=i)
    scores.append(accuracy_score(y_true = y_qda_test, y_pred=fitted_pipeline.predict(X_qda_test)))
    matrices.append([i, confusion_matrix(y_true = y_qda_test, y_pred = fitted_pipeline.predict(X_qda_test))])
    
print(scores)

[0.9767441860465116, 0.9069767441860465, 0.9534883720930233, 0.9069767441860465, 0.8837209302325582, 0.9302325581395349, 0.8837209302325582, 0.8604651162790697, 0.9069767441860465, 0.9069767441860465]


In [105]:
for i in range(10):
    print(matrices[i][1])

[[15  0  0]
 [ 0  9  1]
 [ 0  0 18]]
[[12  2  0]
 [ 1 13  1]
 [ 0  0 14]]
[[13  1  0]
 [ 0 14  1]
 [ 0  0 14]]
[[11  0  0]
 [ 2 16  0]
 [ 1  1 12]]
[[16  1  1]
 [ 1  7  1]
 [ 1  0 15]]
[[15  2  1]
 [ 0 13  0]
 [ 0  0 12]]
[[12  1  1]
 [ 1 13  0]
 [ 2  0 13]]
[[16  1  1]
 [ 1  6  1]
 [ 1  1 15]]
[[15  0  0]
 [ 3 12  1]
 [ 0  0 12]]
[[13  1  0]
 [ 0 13  1]
 [ 2  0 13]]


In [109]:
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("KNN", KNeighborsClassifier(n_jobs=-1))
])

degrees = {
    "KNN__n_neighbors": list(range(0, 20)),
    "KNN__algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "KNN__p": [1, 2],
    "KNN__leaf_size": list(range(0, 50))
}

gscv = GridSearchCV(my_pipeline, degrees, cv = 10, scoring='accuracy')
gscv_fitted = gscv.fit(X, y)
gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

5520 fits failed out of a total of 80000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1600 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/ben/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ben/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ben/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/ben/anaconda3/lib/python3.11/si

In [134]:
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("KNN", KNeighborsClassifier(algorithm="ball_tree", leaf_size = 1, n_jobs =-1, n_neighbors = 3, p = 1))
])

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X_train)

accuracy_score(y_true = y_train, y_pred = y_pred)

# confusion_matrix(y_true=y_train, y_pred = y_pred)

0.7928994082840237

In [135]:
preds = fitted_pipeline.predict(X_test)
named_preds = vectorized_replace(preds)

final_predictions = pd.DataFrame(
    {"id_num": test_data['id'],
    "political_affiliation_predicted": named_preds}
)

final_predictions.to_csv("Classification Prediction 2.csv", index=False)

In [130]:
final_predictions.groupby("political_affiliation_predicted").count()

Unnamed: 0_level_0,id_num
political_affiliation_predicted,Unnamed: 1_level_1
Democrat,58
Independent,65
Republican,43


In [136]:
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("DTree", DecisionTreeClassifier())
])

degrees = {
    "DTree__ccp_alpha": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 0, 1e1, 1e2, 1e3, 1e4, 1e5]
}

gscv = GridSearchCV(my_pipeline, degrees, cv = 10, scoring='accuracy')
gscv_fitted = gscv.fit(X_train, y_train)
gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

In [139]:
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("DTree", DecisionTreeClassifier(ccp_alpha = 1e-3))
])

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X_train)

accuracy_score(y_true = y_train, y_pred = y_pred)

1.0