# MSA 2023 Phase 2 - Part 2

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Classification Model

## 1. Load and split preprocessed data

In [14]:
# Load preprocessed data
market_segmentation = pd.read_csv("../1. Analysis and Preprocessing/preprocessed_datasets/market_segmentation.csv")
market_segmentation.head()

Unnamed: 0,Age,Family_Size,Segmentation,Graduated_Yes,Ever_Married_Yes,Spending_Score_Average,Spending_Score_High,Spending_Score_Low,Profession_Artist,Profession_Doctor,...,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing,Age_Segment_Adult,Age_Segment_Elderly,Age_Segment_Middle_Aged,Age_Adult,Age_Middle_Aged,Age_Elderly
0,0.056338,0.375,D,0,0,0,0,1,0,0,...,1,0,0,0,1,0,0,0.056338,0.0,0.0
1,0.28169,0.25,A,1,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0.28169,0.0,0.0
2,0.690141,0.0,B,1,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0.0,0.0,0.690141
3,0.690141,0.125,B,1,1,0,1,0,0,0,...,0,0,1,0,0,1,0,0.0,0.0,0.690141
4,0.309859,0.625,A,1,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0.309859,0.0,0.0


In [15]:
# Get the features of the dataset
X = market_segmentation.drop(columns=["Segmentation"])

# Get the target of the dataset
y = market_segmentation.Segmentation

# Create an 70-30 train-test split on the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

## 2. Choose an algorithm

In [16]:
# Load an untuned logistic regression model
lg_model = LogisticRegression()

# Load an untuned random forest classifier model
rf_model = RandomForestClassifier()

## 3. Train and test a model

In [17]:
# Train the logistic regression model using the training set
lg_model.fit(X_train, y_train)

# Get predictions using the features of the test set
lg_predictions = lg_model.predict(X_test)

# Test the model using predictions and the targets of the test set
lg_accuracy = accuracy_score(y_test, lg_predictions)

lg_accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5176071277047094

In [18]:
# Train the random forest model using the training set
rf_model.fit(X_train, y_train)

# Get predictions using the features of the test set
rf_predictions = rf_model.predict(X_test)

# Test the model using predictions and the targets of the test set
rf_accuracy = accuracy_score(y_test, rf_predictions)

rf_accuracy

0.4700890963088672

## 4. Evaluate the model 

Please refer to [Evaluating Classification Models](https://github.com/NZMSA/2023-Phase-2/blob/main/data-science/0.%20Resources/docs/evaluating-classification-models.md)

In [19]:
# Evaluate the confusion matrix of the logistic regression model
confusion_matrix(y_test, lg_predictions)

array([[286,  84,  89, 105],
       [160, 146, 184,  63],
       [ 78, 102, 318,  82],
       [140,  23,  27, 470]], dtype=int64)

In [20]:
# Evaluate the confusion matrix of the random forest model
confusion_matrix(y_test, rf_predictions)

array([[212, 132,  95, 125],
       [125, 175, 168,  85],
       [ 83, 128, 300,  69],
       [134,  54,  51, 421]], dtype=int64)

## 5. Summary

# Regression Model

## 1. Load and split preprocessed data

In [21]:
# Load preprocessed data
exam_scores = pd.read_csv("../1. Analysis and Preprocessing/preprocessed_datasets/exam_scores.csv")
exam_scores.head()

Unnamed: 0,WklyStudyHours,MathScore,ReadingScore,WritingScore,LunchType_standard,TestPrep_none,Gender_male,ParentEduc_associate's degree,ParentEduc_bachelor's degree,ParentEduc_high school,ParentEduc_master's degree,ParentEduc_some college,ParentEduc_some high school,EthnicGroup_group A,EthnicGroup_group B,EthnicGroup_group C,EthnicGroup_group D,EthnicGroup_group E
0,0,71,71,74,1,1,0,0,1,0,0,0,0,0,0,0,0,0
1,0,87,93,91,1,1,0,0,0,0,1,0,0,0,1,0,0,0
2,1,45,56,42,0,1,1,1,0,0,0,0,0,1,0,0,0,0
3,1,76,78,75,1,1,1,0,0,0,0,1,0,0,0,1,0,0
4,1,73,84,79,1,1,0,1,0,0,0,0,0,0,1,0,0,0


In [22]:
# Get the features of the dataset
X = exam_scores.drop(columns=["MathScore", "ReadingScore", "WritingScore"])

# Get the target of the dataset
y = exam_scores.MathScore

# Create an 70-30 train-test split on the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

## 2. Choose an algorithm

In [23]:
# Load an untuned linear regression model
lr_model = LinearRegression()

## 3. Train and test a model

In [24]:
# Train the linear regression model using the training set
lr_model.fit(X_train, y_train)

# Get predictions using the features of the test set
lr_predictions = lr_model.predict(X_test)

# Test the model using predictions and the targets of the test set
lr_mse = mean_squared_error(y_test, lr_predictions)

lr_mse

168.76321727312086

## 4. Evaluate the model

Please refer to [Evaluating Regression Models](https://github.com/NZMSA/2023-Phase-2/blob/main/data-science/0.%20Resources/docs/evaluating-regression-models.md)

## 5. Summary