### Import dependencies

In [3]:
# dependencies
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score
import matplotlib.pyplot as plt


# Political Classification

In [48]:
#reading in data
raw_data = pd.read_csv("C:/Users/alexa/OneDrive/Documentos/VSCode Folder/GSB544_Computing_and_ML/Kaggle_Comp/Data/CAH-201803-train.csv")

raw_data

Unnamed: 0,id_num,Q1,Q2,political_affiliation,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,1,Male,53,Independent,Liberal,College degree,Black,No,No,No,"Yes, somewhat religious",Pro-Choice,No,No,Behave no differently,5,2,5,No
1,5,Female,66,Independent,Conservative,Some college,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,Yes,Less Willing,4,5,4,No
2,7,Female,58,Democrat,Liberal,College degree,White,No,No,No,"Yes, very religious",Pro-Choice,No,No,Behave no differently,5,1,4,Yes
3,8,Male,55,Independent,Moderate,High school or less,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,Yes,Yes,Less Willing,4,5,4,Yes
4,9,Male,64,Republican,Conservative,High school or less,White,Yes,Yes,Yes,No,Pro-life,No,No,Behave no differently,5,1,1,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,325,Male,21,Republican,Conservative,Some college,White,Yes,No,Yes,No,Pro-Choice,Yes,No,Less Willing,5,2,5,No
165,328,Female,41,Republican,Liberal,Graduate degree,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-Choice,Yes,No,Less Willing,5,2,2,No
166,329,Male,60,Republican,Conservative,Some college,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,No,Yes,Behave no differently,5,5,4,Yes
167,332,Female,51,Republican,Conservative,Graduate degree,White,Yes,Yes,Yes,"Yes, very religious",Pro-life,Yes,No,Less Willing,2,5,1,No


## Data Cleaning
Data was already very clean and I decided not to change column headers as I had key to reference which columns correlated to which survey questions. Each question seemed to have some relevance. Coefficient analysis below shows that every explanatory variable has somewhat of a signifiicant impact in predicting political affiliation.

In [49]:
raw_data.dtypes

id_num                    int64
Q1                       object
Q2                        int64
political_affiliation    object
Q4                       object
Q5                       object
Q6                       object
Q7                       object
Q8                       object
Q9                       object
Q10                      object
Q11                      object
Q12                      object
Q13                      object
Q14                      object
Q15                       int64
Q16                       int64
Q17                       int64
Q18                      object
dtype: object

### Logistic regression model 
Other models were implemented with various combinations of predictor variables, different parameters, interaction variables, and polynomial transformations, but this model yielded the highest accuracy. 

In [None]:
# initial attempt at using everything for logit model

X = raw_data.drop(['id_num', 'political_affiliation'], axis=1)
y = raw_data['political_affiliation']

ct = ColumnTransformer(
  [
    ("dummify", 
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

pipeline = Pipeline(
    [('preprocessing', ct),
     ('logit', LogisticRegression()),]
)

param_grid = {
    'logit__C': [0.01, 0.1, 1, 10, 100],       # Inverse of regularization strength
    'logit__penalty': ['l1', 'elasticnet', 'none'],  # Type of regularization
    'logit__solver': ['saga', 'lbfgs', 'liblinear'],       # Solver
    'logit__class_weight': [None, 'balanced'],             # Handle class imbalance
    'logit__l1_ratio': [0.1, 0.5, 0.7, 0.9],               # Only for elasticnet penalty
}

# Perform grid search with cross-validation
gscv = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the data
gscv_fitted = gscv.fit(X, y)

# Extract the best parameters and scores
print("Best Parameters:", gscv_fitted.best_params_)
print("Best Accuracy:", gscv_fitted.best_score_)


Best Parameters: {'logit__C': 1, 'logit__class_weight': 'balanced', 'logit__l1_ratio': 0.1, 'logit__penalty': 'l1', 'logit__solver': 'liblinear'}
Best Accuracy: 0.6274509803921569


1200 fits failed out of a total of 1800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
200 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\alexa\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\alexa\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\alexa\AppDa

### Creating predictions on test data 
Had to create a new CSV file with an added 'political_affiliation_predicted' column on test data for Kaggle submission.

In [None]:
# reading in test data and applying initial attempt to test data
test_raw_data = pd.read_csv("C:/Users/alexa/OneDrive/Documentos/VSCode Folder/GSB544_Computing_and_ML/Kaggle_Comp/Data/CAH-201803-test.csv")
test_raw_data 

# pipeline.fit(X, y)

final_predictions = pd.DataFrame(
    {"id_num": test_raw_data['id_num'],
    "political_affiliation_predicted": pipeline.predict(test_raw_data)}
)

Unnamed: 0,id_num,Q1,Q2,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,2,Female,78,Conservative,College degree,White,Yes,Yes,No,"Yes, very religious",Pro-Choice,Yes,Yes,Behave no differently,4,5,1,Yes
1,3,Male,59,Moderate,High school or less,Black,Yes,Yes,Yes,"Yes, very religious",Pro-Choice,No,No,More Willing,5,4,5,No
2,4,Male,59,Moderate,High school or less,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,No,Behave no differently,4,5,1,Yes
3,6,Male,52,Moderate,Graduate degree,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-Choice,No,Yes,Less Willing,5,4,4,No
4,11,Female,33,Moderate,High school or less,White,No,No,Yes,"Yes, somewhat religious",Pro-Choice,No,No,More Willing,5,5,4,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,327,Female,68,Moderate,Graduate degree,White,Yes,No,No,"Yes, very religious",Pro-life,Yes,No,Behave no differently,5,5,2,No
162,330,Male,20,Moderate,High school or less,White,Yes,Yes,Yes,No,Pro-Choice,No,No,Less Willing,5,2,5,No
163,331,Male,65,Conservative,College degree,Latino,Yes,No,No,No,Pro-Choice,Yes,No,Behave no differently,5,2,1,No
164,333,Female,54,Moderate,Graduate degree,White,Yes,No,No,No,Pro-Choice,No,No,Behave no differently,5,1,5,Yes


In [None]:
final_predictions
#for model 1 above

Unnamed: 0,id_num,political_affiliation_predicted
0,2,Republican
1,3,Democrat
2,4,Democrat
3,6,Republican
4,11,Independent
...,...,...
161,327,Democrat
162,330,Independent
163,331,Democrat
164,333,Democrat


### Extra Coefficient Analysis on above model
Wanted to examine how each variable influenced political affiliation to see if there were any variables that did not increase accuracy for any given model. We can see that 'dummify__Q5_Some college' has the lowest overall impact with a coef value of 0.108, which is stil large enough to be included in the model.

In [None]:
pipeline.fit(X, y)

coefs = pipeline.named_steps["logit"].coef_

coefs

# Get feature names
feature_names = pipeline.named_steps["preprocessing"].get_feature_names_out()

# Get coefficients and classes
coefs = pipeline.named_steps["logit"].coef_  # Coefficients for each class
classes = pipeline.named_steps["logit"].classes_  # Target class names

# Create a DataFrame for easy manipulation
coefficients_df = pd.DataFrame(coefs.T, index=feature_names, columns=classes)

# Calculate overall impact for each feature
coefficients_df['Overall_Impact'] = coefficients_df.abs().sum(axis=1)

# Sort by overall impact
coefficients_df = coefficients_df.sort_values(by='Overall_Impact', ascending=False)

coefficients_df

Unnamed: 0,Democrat,Independent,Republican,Overall_Impact
dummify__Q4_Conservative,-0.992039,-0.297187,1.289226,2.578452
dummify__Q4_Liberal,0.776302,-0.048345,-0.727957,1.552604
dummify__Q6_White,-0.622137,0.17539,0.446746,1.244273
dummify__Q4_Moderate,0.216469,0.345926,-0.562395,1.124791
"dummify__Q10_Yes, very religious",0.524718,-0.058498,-0.466219,1.049436
dummify__Q6_Black,0.490328,-0.518574,0.028246,1.037148
dummify__Q6_Asian,-0.082205,0.496,-0.413795,0.992
dummify__Q14_Behave no differently,0.466673,-0.053474,-0.413199,0.933345
dummify__Q5_Graduate degree,-0.212793,-0.226436,0.439228,0.878457
standardize__Q17,-0.244484,0.425446,-0.180962,0.850892


# House Prices

### Import Dependencies

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

### Reading in data

In [5]:
house_raw = pd.read_csv("C:/Users/alexa/OneDrive/Documentos/VSCode Folder/GSB544_Computing_and_ML/Kaggle_Comp/Data/train_new.csv")
house_raw


Unnamed: 0,SalePrice,PID,Lot Frontage,Lot Area,Street,Neighborhood,Bldg Type,House Style,Overall Qual,Overall Cond,...,Full Bath,Half Bath,Bedroom AbvGr,TotRms AbvGrd,Gr Liv Area,Functional,Screen Porch,Pool Area,Yr Sold,Sale Type
0,159000,531363010,80.0,9605,Pave,SawyerW,1Fam,1Story,7,6,...,1,1,3,6,1218,Typ,0,0,2009,WD
1,271900,906203120,90.0,14684,Pave,SawyerW,1Fam,1Story,7,7,...,2,0,3,7,2196,Typ,0,0,2009,WD
2,137500,916176030,,14375,Pave,Timber,1Fam,SLvl,6,6,...,1,0,3,7,1344,Typ,233,0,2009,COD
3,248500,528180130,48.0,6472,Pave,NridgHt,TwnhsE,1Story,9,5,...,2,0,2,6,1456,Typ,0,0,2009,WD
4,167000,528290030,61.0,9734,Pave,Gilbert,1Fam,SLvl,7,5,...,2,1,3,7,1374,Typ,0,0,2009,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2192,220000,906420020,80.0,10041,Pave,SawyerW,1Fam,2Story,8,5,...,2,1,3,8,1915,Typ,0,0,2006,WD
2193,160000,909129090,70.0,6300,Pave,SWISU,1Fam,1.5Fin,5,4,...,1,1,3,7,1268,Typ,0,0,2009,WD
2194,225000,528292060,41.0,12460,Pave,Gilbert,1Fam,2Story,7,5,...,2,1,4,8,2322,Typ,0,0,2008,WD
2195,83000,905426060,85.0,10625,Pave,Edwards,1Fam,1Story,5,5,...,1,0,2,5,835,Typ,0,0,2010,COD


### Data cleaning
By looking at the data in the 'data wrangler' extension from VS code, 'Lot Frontage' is the only variable with a large amount of missing values (16% of values). To fill in these missing values, a random forest model uses all other predictors variables to predict lot frontage missing values. Now we have a dataset with only a few missing values.

In [6]:
# cleaning data to replace missing lot fronatage values with predicted lot frontage values

# Step 1: Analyze correlations to determine relevant features
# Select only numeric columns for correlation analysis
numeric_data = house_raw.select_dtypes(include=[np.number])
correlations = numeric_data.corr()
lot_frontage_corr = correlations['Lot Frontage'].sort_values(ascending=False)

# Select top correlated features (excluding Lot Frontage itself)
relevant_features = lot_frontage_corr.index[1:6]

# Step 2: Prepare data for modeling
# Include only relevant features for modeling, add categorical feature 'Neighborhood'
features = list(relevant_features) + ['Neighborhood']
data_model = house_raw[features + ['Lot Frontage']].copy()

# Convert categorical variables to dummy/indicator variables
data_model = pd.get_dummies(data_model, columns=['Neighborhood'], drop_first=True)

# Split the dataset into rows with and without "Lot Frontage"
data_with_frontage = data_model.dropna(subset=['Lot Frontage'])
data_without_frontage = data_model[data_model['Lot Frontage'].isnull()]

# Step 3: Debugging to ensure rows are available for missing data prediction
print("Shape of data_without_frontage:", data_without_frontage.shape)

# Separate predictors and target for rows with frontage
X = data_with_frontage.drop(columns=['Lot Frontage'])
y = data_with_frontage['Lot Frontage']

# Ensure consistent feature alignment between training and missing data
X_missing = data_without_frontage.drop(columns=['Lot Frontage'])
X_missing = X_missing.reindex(columns=X.columns, fill_value=0)

# Debugging output
print("Shape of X_missing after alignment:", X_missing.shape)

# Step 4: Train and evaluate the model if rows are available
if not X_missing.empty:
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error
    import numpy as np

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the regression model
    model = RandomForestRegressor(random_state=42, n_estimators=100)
    model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Predict missing "Lot Frontage" values
    predicted_frontage = model.predict(X_missing)

    # Fill the missing values
    house_raw.loc[house_raw['Lot Frontage'].isnull(), 'Lot Frontage'] = predicted_frontage

    # Display the RMSE and the updated dataset head
    print("Model RMSE:", rmse)
    print(house_raw.head())
else:
    print("Error: No rows available for prediction. Check preprocessing steps.")


Shape of data_without_frontage: (362, 33)
Shape of X_missing after alignment: (362, 32)
Model RMSE: 17.922856831507772
   SalePrice        PID  Lot Frontage  Lot Area Street Neighborhood Bldg Type  \
0     159000  531363010         80.00      9605   Pave      SawyerW      1Fam   
1     271900  906203120         90.00     14684   Pave      SawyerW      1Fam   
2     137500  916176030         90.13     14375   Pave       Timber      1Fam   
3     248500  528180130         48.00      6472   Pave      NridgHt    TwnhsE   
4     167000  528290030         61.00      9734   Pave      Gilbert      1Fam   

  House Style  Overall Qual  Overall Cond  ...  Full Bath Half Bath  \
0      1Story             7             6  ...          1         1   
1      1Story             7             7  ...          2         0   
2        SLvl             6             6  ...          1         0   
3      1Story             9             5  ...          2         0   
4        SLvl             7            

In [7]:
house_raw

Unnamed: 0,SalePrice,PID,Lot Frontage,Lot Area,Street,Neighborhood,Bldg Type,House Style,Overall Qual,Overall Cond,...,Full Bath,Half Bath,Bedroom AbvGr,TotRms AbvGrd,Gr Liv Area,Functional,Screen Porch,Pool Area,Yr Sold,Sale Type
0,159000,531363010,80.00,9605,Pave,SawyerW,1Fam,1Story,7,6,...,1,1,3,6,1218,Typ,0,0,2009,WD
1,271900,906203120,90.00,14684,Pave,SawyerW,1Fam,1Story,7,7,...,2,0,3,7,2196,Typ,0,0,2009,WD
2,137500,916176030,90.13,14375,Pave,Timber,1Fam,SLvl,6,6,...,1,0,3,7,1344,Typ,233,0,2009,COD
3,248500,528180130,48.00,6472,Pave,NridgHt,TwnhsE,1Story,9,5,...,2,0,2,6,1456,Typ,0,0,2009,WD
4,167000,528290030,61.00,9734,Pave,Gilbert,1Fam,SLvl,7,5,...,2,1,3,7,1374,Typ,0,0,2009,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2192,220000,906420020,80.00,10041,Pave,SawyerW,1Fam,2Story,8,5,...,2,1,3,8,1915,Typ,0,0,2006,WD
2193,160000,909129090,70.00,6300,Pave,SWISU,1Fam,1.5Fin,5,4,...,1,1,3,7,1268,Typ,0,0,2009,WD
2194,225000,528292060,41.00,12460,Pave,Gilbert,1Fam,2Story,7,5,...,2,1,4,8,2322,Typ,0,0,2008,WD
2195,83000,905426060,85.00,10625,Pave,Edwards,1Fam,1Story,5,5,...,1,0,2,5,835,Typ,0,0,2010,COD


Here we are dropping the one row with missing values from other predictor variables.

In [8]:
house_clean = house_raw.dropna()
house_clean

Unnamed: 0,SalePrice,PID,Lot Frontage,Lot Area,Street,Neighborhood,Bldg Type,House Style,Overall Qual,Overall Cond,...,Full Bath,Half Bath,Bedroom AbvGr,TotRms AbvGrd,Gr Liv Area,Functional,Screen Porch,Pool Area,Yr Sold,Sale Type
0,159000,531363010,80.00,9605,Pave,SawyerW,1Fam,1Story,7,6,...,1,1,3,6,1218,Typ,0,0,2009,WD
1,271900,906203120,90.00,14684,Pave,SawyerW,1Fam,1Story,7,7,...,2,0,3,7,2196,Typ,0,0,2009,WD
2,137500,916176030,90.13,14375,Pave,Timber,1Fam,SLvl,6,6,...,1,0,3,7,1344,Typ,233,0,2009,COD
3,248500,528180130,48.00,6472,Pave,NridgHt,TwnhsE,1Story,9,5,...,2,0,2,6,1456,Typ,0,0,2009,WD
4,167000,528290030,61.00,9734,Pave,Gilbert,1Fam,SLvl,7,5,...,2,1,3,7,1374,Typ,0,0,2009,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2192,220000,906420020,80.00,10041,Pave,SawyerW,1Fam,2Story,8,5,...,2,1,3,8,1915,Typ,0,0,2006,WD
2193,160000,909129090,70.00,6300,Pave,SWISU,1Fam,1.5Fin,5,4,...,1,1,3,7,1268,Typ,0,0,2009,WD
2194,225000,528292060,41.00,12460,Pave,Gilbert,1Fam,2Story,7,5,...,2,1,4,8,2322,Typ,0,0,2008,WD
2195,83000,905426060,85.00,10625,Pave,Edwards,1Fam,1Story,5,5,...,1,0,2,5,835,Typ,0,0,2010,COD


#### Must convert SalePrice to log(SalePrice) before creating any models, then find the RMSE using the log(y_pred) from the test data to find the RMSE, then convert the log(y_pred) back to regular dollars by exponentiating it for the final test data CSV predictions on Kaggle.

In [9]:
# converting SalesPrice into log(SalePrice)
house_clean['LogSalePrice'] = np.log(house_clean['SalePrice'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_clean['LogSalePrice'] = np.log(house_clean['SalePrice'])


In [10]:
house_clean

Unnamed: 0,SalePrice,PID,Lot Frontage,Lot Area,Street,Neighborhood,Bldg Type,House Style,Overall Qual,Overall Cond,...,Half Bath,Bedroom AbvGr,TotRms AbvGrd,Gr Liv Area,Functional,Screen Porch,Pool Area,Yr Sold,Sale Type,LogSalePrice
0,159000,531363010,80.00,9605,Pave,SawyerW,1Fam,1Story,7,6,...,1,3,6,1218,Typ,0,0,2009,WD,11.976659
1,271900,906203120,90.00,14684,Pave,SawyerW,1Fam,1Story,7,7,...,0,3,7,2196,Typ,0,0,2009,WD,12.513190
2,137500,916176030,90.13,14375,Pave,Timber,1Fam,SLvl,6,6,...,0,3,7,1344,Typ,233,0,2009,COD,11.831379
3,248500,528180130,48.00,6472,Pave,NridgHt,TwnhsE,1Story,9,5,...,0,2,6,1456,Typ,0,0,2009,WD,12.423198
4,167000,528290030,61.00,9734,Pave,Gilbert,1Fam,SLvl,7,5,...,1,3,7,1374,Typ,0,0,2009,WD,12.025749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2192,220000,906420020,80.00,10041,Pave,SawyerW,1Fam,2Story,8,5,...,1,3,8,1915,Typ,0,0,2006,WD,12.301383
2193,160000,909129090,70.00,6300,Pave,SWISU,1Fam,1.5Fin,5,4,...,1,3,7,1268,Typ,0,0,2009,WD,11.982929
2194,225000,528292060,41.00,12460,Pave,Gilbert,1Fam,2Story,7,5,...,1,4,8,2322,Typ,0,0,2008,WD,12.323856
2195,83000,905426060,85.00,10625,Pave,Edwards,1Fam,1Story,5,5,...,0,2,5,835,Typ,0,0,2010,COD,11.326596


### Initial Ridge Model with all predictors
For this and all following models, RMSE is used as Kaggle's preferred model evaluation metric. I have chosen to use a grid search method to tune hyperparameters of each chosen model with their given predictor variables.

In [18]:
# ridge regression with all predictors
from sklearn.linear_model import Ridge
X = house_clean.drop(['SalePrice', 'PID', 'LogSalePrice'], axis=1)
y = house_clean['LogSalePrice']

# Define the ColumnTransformer for preprocessing
ct = ColumnTransformer(
    [
        ("dummify", 
         OneHotEncoder(sparse_output=False, handle_unknown='ignore'), 
         make_column_selector(dtype_include=object)),
        ("standardize", 
         StandardScaler(), 
         make_column_selector(dtype_include=np.number))
    ],
    remainder="passthrough"
)

# Define the pipeline with Ridge Regression
ridge_pipeline = Pipeline(
    [
        ('preprocessing', ct),
        ('ridge', Ridge(max_iter=10000, random_state=42))
    ]
)

# Define the parameter grid for Ridge
param_grid = {
    'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'ridge__fit_intercept': [True, False],          # Whether to calculate the intercept
    'ridge__tol': [1e-4, 1e-3, 1e-2],              # Tolerance for optimization
    'ridge__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],  # Solvers
}

# Perform grid search with cross-validation
gscv = GridSearchCV(ridge_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
gscv_fitted = gscv.fit(X, y)

# Extract the best parameters and scores
print("Best Parameters:", gscv_fitted.best_params_)
print("Best RMSE Score:", abs(gscv_fitted.best_score_))


Best Parameters: {'ridge__alpha': 10, 'ridge__fit_intercept': True, 'ridge__solver': 'sparse_cg', 'ridge__tol': 0.001}
Best RMSE Score: 0.14857858925547804


### Ridge model without 'totrms abvgrd'

In [19]:
# ridge regression without 1. totrms abvgrd
# ridge regression
from sklearn.linear_model import Ridge
X = house_clean.drop(['SalePrice', 'PID', 'LogSalePrice', 'TotRms AbvGrd'], axis=1)
y = house_clean['LogSalePrice']

# Define the ColumnTransformer for preprocessing
ct = ColumnTransformer(
    [
        ("dummify", 
         OneHotEncoder(sparse_output=False, handle_unknown='ignore'), 
         make_column_selector(dtype_include=object)),
        ("standardize", 
         StandardScaler(), 
         make_column_selector(dtype_include=np.number))
    ],
    remainder="passthrough"
)

# Define the pipeline with Ridge Regression
ridge_pipeline = Pipeline(
    [
        ('preprocessing', ct),
        ('ridge', Ridge(max_iter=10000, random_state=42))
    ]
)

# Define the parameter grid for Ridge
param_grid = {
    'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'ridge__fit_intercept': [True, False],          # Whether to calculate the intercept
    'ridge__tol': [1e-4, 1e-3, 1e-2],              # Tolerance for optimization
    'ridge__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],  # Solvers
}

# Perform grid search with cross-validation
gscv = GridSearchCV(ridge_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
gscv_fitted = gscv.fit(X, y)

# Extract the best parameters and scores
print("Best Parameters:", gscv_fitted.best_params_)
print("Best RMSE Score:", abs(gscv_fitted.best_score_))


Best Parameters: {'ridge__alpha': 10, 'ridge__fit_intercept': True, 'ridge__solver': 'lsqr', 'ridge__tol': 0.001}
Best RMSE Score: 0.1484600298822546


### Ridge model with less predictor variables (SECOND BEST MODEL)

In [11]:
# ridge regression without totrms abvgrd and lot frontage and pool area
from sklearn.linear_model import Ridge
X = house_clean.drop(['SalePrice', 'PID', 'LogSalePrice', 'TotRms AbvGrd', 'Lot Frontage', 'Pool Area',], axis=1)
y = house_clean['LogSalePrice']

# Define the ColumnTransformer for preprocessing
ct = ColumnTransformer(
    [
        ("dummify", 
         OneHotEncoder(sparse_output=False, handle_unknown='ignore'), 
         make_column_selector(dtype_include=object)),
        ("standardize", 
         StandardScaler(), 
         make_column_selector(dtype_include=np.number))
    ],
    remainder="passthrough"
)

# Define the pipeline with Ridge Regression
best_ridge_pipeline = Pipeline(
    [
        ('preprocessing', ct),
        ('ridge', Ridge(max_iter=10000, random_state=42))
    ]
)

# Define the parameter grid for Ridge
param_grid = {
    'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'ridge__fit_intercept': [True, False],          # Whether to calculate the intercept
    'ridge__tol': [1e-4, 1e-3, 1e-2],              # Tolerance for optimization
    'ridge__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],  # Solvers
}

# Perform grid search with cross-validation
gscv = GridSearchCV(best_ridge_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
gscv_fitted = gscv.fit(X, y)

# Extract the best parameters and scores
print("Best Parameters:", gscv_fitted.best_params_)
print("Best RMSE Score:", abs(gscv_fitted.best_score_))


Best Parameters: {'ridge__alpha': 10, 'ridge__fit_intercept': True, 'ridge__solver': 'sparse_cg', 'ridge__tol': 0.001}
Best RMSE Score: 0.14706169609551506


Getting prediction CSV for Kaggle submission.

In [12]:
# getting dataframe with predictions for this ^ model 
test_house_data = pd.read_csv("C:/Users/alexa/OneDrive/Documentos/VSCode Folder/GSB544_Computing_and_ML/Kaggle_Comp/Data/test_new.csv")
test_house_data 

best_ridge_pipeline.fit(X, y)


final_predictions = pd.DataFrame(
    {"PID": test_house_data['PID'],
    "SalePrice": best_ridge_pipeline.predict(test_house_data)}
)

final_predictions

Unnamed: 0,PID,SalePrice
0,907135180,11.770305
1,528181040,12.300574
2,528175010,12.300769
3,531379030,12.128099
4,923275090,11.764799
...,...,...
600,528174060,12.114607
601,903400180,12.050217
602,903227150,11.793525
603,909250070,11.967114


In [13]:
# exponentiating log values to get prediction in dollars
final_predictions['SalePrice'] = np.exp(final_predictions['SalePrice'])
final_predictions

Unnamed: 0,PID,SalePrice
0,907135180,129353.545104
1,528181040,219822.183821
2,528175010,219865.008445
3,531379030,184997.795732
4,923275090,128643.388511
...,...,...
600,528174060,182518.473143
601,903400180,171136.567154
602,903227150,132392.284439
603,909250070,157489.450688


## Ridge model with unimportant predictor variables and exponentiated 'Gr Liv Area' (BEST MODEL)
Here I decided to exponentiate Gr Liv Area to further standardize it. It would make sense that Gr Liv Area might not have a linear relationship with Sale Price since there might be diminishing OR accelerating returns on larger properties. Exponentiating this variable results in an exponentially larger or smaller impact on price depending on the observation. Additionally, Gr Liv Area has a right skewed distribution (as is common with size-related metrics like square feet) according to the Data Wrangler extension, which exponentiating helps to make more normal. These reasons are why this is the best model for predicting Sale Price. 

In [90]:
# ridge with polynomials
X = house_clean.drop(['SalePrice', 'PID', 'LogSalePrice', 'TotRms AbvGrd', 'Lot Frontage', 'Pool Area',], axis=1)
y = house_clean['LogSalePrice']

ct = ColumnTransformer(
    [
        ("dummify", 
         OneHotEncoder(sparse_output=False, handle_unknown='ignore'), 
         make_column_selector(dtype_include=object)),
        ("standardize", 
         StandardScaler(), 
         make_column_selector(dtype_include=np.number))
    ],
    remainder="passthrough"
).set_output(transform="pandas")

ct_poly = ColumnTransformer([
    ('squared', PolynomialFeatures(degree = 2, include_bias=False), ['standardize__Gr Liv Area'])], remainder='passthrough'
).set_output(transform="pandas")

# Define the pipeline with Ridge Regression
poly_ridge_pipeline = Pipeline(
    [
        ('preprocessing', ct),
        ('poly', ct_poly),
        ('ridge', Ridge(max_iter=10000, random_state=42))
    ]
)
# ct_poly.fit_transform(ct.fit_transform(X))


# Define the parameter grid for Ridge
param_grid = {
    'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'ridge__fit_intercept': [True, False],          # Whether to calculate the intercept
    'ridge__tol': [1e-4, 1e-3, 1e-2],              # Tolerance for optimization
    'ridge__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],  # Solvers
}

# Perform grid search with cross-validation
gscv = GridSearchCV(poly_ridge_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
gscv_fitted = gscv.fit(X, y)

# Extract the best parameters and scores
print("Best Parameters:", gscv_fitted.best_params_)
print("Best RMSE Score:", abs(gscv_fitted.best_score_))


Best Parameters: {'ridge__alpha': 10, 'ridge__fit_intercept': True, 'ridge__solver': 'lsqr', 'ridge__tol': 0.001}
Best RMSE Score: 0.13528459066091847


In [92]:
# getting dataframe with predictions for this ^ model 
test_house_data = pd.read_csv("C:/Users/alexa/OneDrive/Documentos/VSCode Folder/GSB544_Computing_and_ML/Kaggle_Comp/Data/test_new.csv")
test_house_data 

poly_ridge_pipeline.fit(X, y)


final_ridgepoly_predictions = pd.DataFrame(
    {"PID": test_house_data['PID'],
    "SalePrice": poly_ridge_pipeline.predict(test_house_data)}
)

final_ridgepoly_predictions

Unnamed: 0,PID,SalePrice
0,907135180,11.727208
1,528181040,12.305747
2,528175010,12.302247
3,531379030,12.163736
4,923275090,11.697161
...,...,...
600,528174060,12.128654
601,903400180,12.071290
602,903227150,11.775040
603,909250070,12.007623


In [93]:
# exponentiating log values to get prediction in dollars
final_ridgepoly_predictions['SalePrice'] = np.exp(final_ridgepoly_predictions['SalePrice'])
final_ridgepoly_predictions

Unnamed: 0,PID,SalePrice
0,907135180,123897.250511
1,528181040,220962.172153
2,528175010,220190.289025
3,531379030,191709.452575
4,923275090,120229.894498
...,...,...
600,528174060,185100.488474
601,903400180,174781.210486
602,903227150,129967.536821
603,909250070,164000.278748


### Additional ridge regressions

In [22]:
# ridge regression without totrms abvgrd and lot frontage and pool area and half bath
from sklearn.linear_model import Ridge
X = house_clean.drop(['SalePrice', 'PID', 'LogSalePrice', 'TotRms AbvGrd', 'Lot Frontage', 'Pool Area', 'Half Bath'], axis=1)
y = house_clean['LogSalePrice']

# Define the ColumnTransformer for preprocessing
ct = ColumnTransformer(
    [
        ("dummify", 
         OneHotEncoder(sparse_output=False, handle_unknown='ignore'), 
         make_column_selector(dtype_include=object)),
        ("standardize", 
         StandardScaler(), 
         make_column_selector(dtype_include=np.number))
    ],
    remainder="passthrough"
)

# Define the pipeline with Ridge Regression
ridge_pipeline = Pipeline(
    [
        ('preprocessing', ct),
        ('ridge', Ridge(max_iter=10000, random_state=42))
    ]
)

# Define the parameter grid for Ridge
param_grid = {
    'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'ridge__fit_intercept': [True, False],          # Whether to calculate the intercept
    'ridge__tol': [1e-4, 1e-3, 1e-2],              # Tolerance for optimization
    'ridge__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],  # Solvers
}

# Perform grid search with cross-validation
gscv = GridSearchCV(ridge_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
gscv_fitted = gscv.fit(X, y)

# Extract the best parameters and scores
print("Best Parameters:", gscv_fitted.best_params_)
print("Best RMSE Score:", abs(gscv_fitted.best_score_))


Best Parameters: {'ridge__alpha': 10, 'ridge__fit_intercept': True, 'ridge__solver': 'sparse_cg', 'ridge__tol': 0.001}
Best RMSE Score: 0.14716252201866173


In [23]:
# ridge regression without totrms abvgrd and lot frontage and half bath
from sklearn.linear_model import Ridge
X = house_clean.drop(['SalePrice', 'PID', 'LogSalePrice', 'TotRms AbvGrd', 'Lot Frontage', 'Half Bath'], axis=1)
y = house_clean['LogSalePrice']

# Define the ColumnTransformer for preprocessing
ct = ColumnTransformer(
    [
        ("dummify", 
         OneHotEncoder(sparse_output=False, handle_unknown='ignore'), 
         make_column_selector(dtype_include=object)),
        ("standardize", 
         StandardScaler(), 
         make_column_selector(dtype_include=np.number))
    ],
    remainder="passthrough"
)

# Define the pipeline with Ridge Regression
ridge_pipeline = Pipeline(
    [
        ('preprocessing', ct),
        ('ridge', Ridge(max_iter=10000, random_state=42))
    ]
)

# Define the parameter grid for Ridge
param_grid = {
    'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'ridge__fit_intercept': [True, False],          # Whether to calculate the intercept
    'ridge__tol': [1e-4, 1e-3, 1e-2],              # Tolerance for optimization
    'ridge__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],  # Solvers
}

# Perform grid search with cross-validation
gscv = GridSearchCV(ridge_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
gscv_fitted = gscv.fit(X, y)

# Extract the best parameters and scores
print("Best Parameters:", gscv_fitted.best_params_)
print("Best RMSE Score:", abs(gscv_fitted.best_score_))


Best Parameters: {'ridge__alpha': 10, 'ridge__fit_intercept': True, 'ridge__solver': 'lsqr', 'ridge__tol': 0.001}
Best RMSE Score: 0.14794094944083272


### Lasso regression attempt

In [59]:
# lasso regression
from sklearn.linear_model import Lasso

X = house_clean.drop(['SalePrice', 'PID', 'LogSalePrice', 'TotRms AbvGrd', 'Yr Sold', 'Half Bath'], axis = 1)
y = house_clean['LogSalePrice']

ct = ColumnTransformer(
  [
    ("dummify", 
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lasso_pipeline = Pipeline(
    [
        ('preprocessing', ct),
        ('lasso', Lasso(max_iter=10000, random_state=42))
    ]
)

param_grid = {
    'lasso__alpha': [0.001, 0.01, 0.1, 1, 10],   # Regularization strength
    'lasso__fit_intercept': [True, False],       # Whether to calculate the intercept
    'lasso__tol': [1e-4, 1e-3, 1e-2],           # Tolerance for optimization
    'lasso__selection': ['cyclic', 'random'],    # Coordinate descent strategy
}

# Perform grid search with cross-validation
gscv = GridSearchCV(lasso_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
gscv_fitted = gscv.fit(X, y)

# Extract the best parameters and scores
print("Best Parameters:", gscv_fitted.best_params_)
print("Best RMSE Score:", abs(gscv_fitted.best_score_))

Best Parameters: {'lasso__alpha': 0.001, 'lasso__fit_intercept': True, 'lasso__selection': 'random', 'lasso__tol': 0.01}
Best RMSE Score: 0.1512820204376578


### Coefficient analysis for aboce lasso refression

In [60]:
# Fit the pipeline to the data
lasso_pipeline.fit(X, y)

# Get feature names after preprocessing
feature_names = lasso_pipeline.named_steps["preprocessing"].get_feature_names_out()

# Get coefficients from the Ridge model
lasso_coefs = lasso_pipeline.named_steps["lasso"].coef_

# Map coefficients to their corresponding feature names
coefficients = dict(zip(feature_names, lasso_coefs))

# Create a DataFrame for the coefficients
coefficients_df = pd.DataFrame(list(coefficients.items()), columns=['Feature', 'Coefficient'])

# Sort coefficients by absolute value for better readability
coefficients_df['Absolute_Coefficient'] = coefficients_df['Coefficient'].abs()
coefficients_df = coefficients_df.sort_values(by='Absolute_Coefficient', ascending=False)

# Display the sorted coefficients
coefficients_df

Unnamed: 0,Feature,Coefficient,Absolute_Coefficient
0,dummify__Street_Grvl,-0.0,0.0
1,dummify__Street_Pave,0.0,0.0
2,dummify__Neighborhood_Blmngtn,0.0,0.0
3,dummify__Neighborhood_Blueste,-0.0,0.0
4,dummify__Neighborhood_BrDale,-0.0,0.0
...,...,...,...
85,standardize__Full Bath,0.0,0.0
86,standardize__Bedroom AbvGr,0.0,0.0
87,standardize__Gr Liv Area,0.0,0.0
88,standardize__Screen Porch,0.0,0.0


### Elastic Net Model

In [50]:
# elastic net model
from sklearn.linear_model import ElasticNet

X = house_clean.drop(['SalePrice', 'PID', 'LogSalePrice', 'Lot Frontage', 'Pool Area', 'TotRms AbvGrd'], axis = 1)
y = house_clean['LogSalePrice']

ct = ColumnTransformer(
  [
    ("dummify", 
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

en_pipeline = Pipeline(
    [
        ('preprocessing', ct),
        ('elastic', ElasticNet())  # Adjust max_iter for convergence
    ]
)

# Define the parameter grid for ElasticNet
param_grid = {
    'elastic__alpha': [0.0001, 0.001], # Regularization strength
    'elastic__l1_ratio': [0.1, 0.2],   # Mix of L1 and L2 penalties
    'elastic__copy_X': [True, False],                # Whether to copy the input data
    'elastic__positive': [True, False],              # Restrict coefficients to be positive
    'elastic__selection': ['cyclic', 'random'],      # Coordinate descent strategy
    'elastic__tol': [1e-4, 1e-3, 1e-2, 0.01],              # Tolerance for optimization
}

# Perform grid search with cross-validation
gscv = GridSearchCV(en_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
gscv_fitted = gscv.fit(X, y)

# Extract the best parameters and scores
print("Best Parameters:", gscv_fitted.best_params_)
print("Best RMSE Score:", abs(gscv_fitted.best_score_))

Best Parameters: {'elastic__alpha': 0.001, 'elastic__copy_X': True, 'elastic__l1_ratio': 0.2, 'elastic__positive': False, 'elastic__selection': 'random', 'elastic__tol': 0.01}
Best RMSE Score: 0.14695717979195771


In [None]:
final_predictions['SalePrice'] = np.exp(final_predictions['SalePrice'])

In [59]:
final_predictions

Unnamed: 0,PID,SalePrice
0,907135180,129682.268351
1,528181040,220706.445030
2,528175010,220149.167426
3,531379030,185343.966445
4,923275090,130231.243918
...,...,...
600,528174060,182910.859413
601,903400180,171021.576376
602,903227150,131768.913644
603,909250070,158555.195464


### Reference Statement:
ChatGPT4o assisted me in this assignment with finding coefficient values for the models in which I thought it would be useful to see coefficients for dimension reduction. It also helped with setting up a model to find predicted values for missing 'Lot Frontage' values to create a cleaned dataset for predicting SalePrice in the house dataset. 