# Linear Regression
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/andrychowanda/COMP6577/blob/master/COMP6577-3.ipynb)

Import required packages

In [0]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sb
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


Load and Profile the Data

In [None]:
!wget https://raw.githubusercontent.com/andrychowanda/COMP6577/master/FuelConsumptionCo2.csv

In [94]:
# Read the CSV downloaded by the previous cell into a DataFrame.
fuelData = pd.read_csv("FuelConsumptionCo2.csv")

# Quick profile: the dtype of every column, followed by one boolean that is
# True when at least one cell anywhere in the frame is missing.
print(fuelData.dtypes, fuelData.isna().values.any(), sep="\n")

MODELYEAR                     int64
MAKE                         object
MODEL                        object
VEHICLECLASS                 object
ENGINESIZE                  float64
CYLINDERS                     int64
TRANSMISSION                 object
FUELTYPE                     object
FUELCONSUMPTION_CITY        float64
FUELCONSUMPTION_HWY         float64
FUELCONSUMPTION_COMB        float64
FUELCONSUMPTION_COMB_MPG      int64
CO2EMISSIONS                  int64
dtype: object
False


Create new Data Frame with selected Features

In [95]:
# Narrow the frame to the numeric columns used below: the candidate predictors
# plus the CO2EMISSIONS target.
newFuelData = fuelData[
    [
        "CYLINDERS",
        "ENGINESIZE",
        "CO2EMISSIONS",
        "FUELCONSUMPTION_COMB",
        "FUELCONSUMPTION_CITY",
        "FUELCONSUMPTION_HWY",
        "FUELCONSUMPTION_COMB_MPG",
    ]
]
print(newFuelData.head())

   CYLINDERS  ENGINESIZE  ...  FUELCONSUMPTION_HWY  FUELCONSUMPTION_COMB_MPG
0          4         2.0  ...                  6.7                        33
1          4         2.4  ...                  7.7                        29
2          4         1.5  ...                  5.8                        48
3          6         3.5  ...                  9.1                        25
4          6         3.5  ...                  8.7                        27

[5 rows x 7 columns]


Train the data using linear regression

In [96]:
# Hold out 20% of the rows for evaluation. The split is seeded so that the
# coefficients and metrics are the same on every Restart & Run All; without
# random_state each run shuffles differently and the numbers drift.
train, test = train_test_split(newFuelData, test_size=0.2, random_state=42)

# Simple linear regression: CO2EMISSIONS as a function of ENGINESIZE only.
# Both X and y are selected with double brackets, so they stay 2-D and
# coef_ / intercept_ print as nested arrays.
regression = linear_model.LinearRegression()
regression.fit(train[["ENGINESIZE"]], train[["CO2EMISSIONS"]])
print('Coefficients: ', regression.coef_)
print('Intercept: ',regression.intercept_)

Coefficients:  [[38.97748989]]
Intercept:  [125.55568676]


Evaluate the model

In [97]:
# Predict CO2 emissions for the held-out rows from their engine size.
prediction = regression.predict(test[["ENGINESIZE"]])

# One line per test row: the engine size and the predicted emission, each
# printed as a one-element array.
for engineSize, predicted in zip(test[["ENGINESIZE"]].values, prediction):
  print(engineSize, predicted)

# Regression metrics on the held-out rows, printed in the order MAE, MSE, R2.
for label, metric in (
    ("MAE : ", mean_absolute_error),
    ("MSE : ", mean_squared_error),
    ("R2 : ", r2_score),
):
  print(label, metric(test[["CO2EMISSIONS"]], prediction))


[3.8] [273.67014835]
[1.4] [180.12417261]
[4.8] [312.64763824]
[3.6] [265.87465037]
[6.8] [390.60261802]
[3.5] [261.97690138]
[2.] [203.51066654]
[4.] [281.46564632]
[3.7] [269.77239936]
[1.8] [195.71516856]
[3.6] [265.87465037]
[3.5] [261.97690138]
[3.6] [265.87465037]
[5.3] [332.13638318]
[1.4] [180.12417261]
[3.6] [265.87465037]
[2.] [203.51066654]
[1.4] [180.12417261]
[2.1] [207.40841553]
[3.] [242.48815643]
[3.5] [261.97690138]
[3.6] [265.87465037]
[1.5] [184.0219216]
[2.] [203.51066654]
[3.5] [261.97690138]
[2.5] [222.99941149]
[2.5] [222.99941149]
[2.4] [219.1016625]
[4.3] [293.15889329]
[2.4] [219.1016625]
[3.5] [261.97690138]
[5.3] [332.13638318]
[3.6] [265.87465037]
[4.4] [297.05664228]
[4.4] [297.05664228]
[2.] [203.51066654]
[3.] [242.48815643]
[5.3] [332.13638318]
[1.4] [180.12417261]
[3.] [242.48815643]
[3.6] [265.87465037]
[3.6] [265.87465037]
[5.3] [332.13638318]
[3.5] [261.97690138]
[3.6] [265.87465037]
[2.5] [222.99941149]
[2.] [203.51066654]
[6.2] [367.21612409]
[4.]

# Classification - KNN

Import all required packages

In [0]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


Load & profile the data

In [None]:
!wget https://raw.githubusercontent.com/andrychowanda/COMP6577/master/titanic_train.csv

In [99]:
dataTitanic = pd.read_csv("titanic_train.csv")

# Profile every column and fill the gaps in non-object columns with the column
# mean. Object (string) columns are printed but left untouched.
# NOTE(review): the mean is computed on the full data set before the split, so
# test rows influence the imputed values; acceptable for a demo, worth
# revisiting for a real evaluation.
for column in dataTitanic:
  print(column, dataTitanic[column].dtypes)
  if dataTitanic[column].isna().values.any() and dataTitanic[column].dtypes != object:
      dataTitanic[column] = dataTitanic[column].fillna(dataTitanic[column].mean())

# 80/20 split shared by the KNN, Logistic Regression, Decision Tree and SVM
# sections below. Seeded so all of their metrics are reproducible across runs.
train, test = train_test_split(dataTitanic, test_size=0.2, random_state=42)


passenger_id int64
pclass int64
name object
sex object
age float64
sibsp int64
parch int64
ticket object
fare float64
cabin object
embarked object
boat object
body float64
home.dest object
survived int64


Training Process

In [0]:
# k-nearest-neighbours classifier with k = 3, predicting `survived` from the
# passenger's age and fare.
KNN = KNeighborsClassifier(n_neighbors = 3)
KNN.fit(train[["age", "fare"]], train["survived"])

Classify new data & evaluate the model

In [101]:
# Two hypothetical passengers given as (age, fare). The model was fitted on a
# DataFrame, so the new samples are wrapped in a DataFrame with the same
# column names; a bare list makes recent scikit-learn versions warn that
# "X does not have valid feature names".
newPassengers = pd.DataFrame([[17, 8], [8, 10]], columns=["age", "fare"])
newClassification = KNN.predict(newPassengers)
print(newClassification)

# Score the model on the held-out rows. With 0/1 labels every error has
# magnitude 1, so MAE and MSE both equal 1 - accuracy.
classification = KNN.predict(test[["age", "fare"]])
accuracy = accuracy_score(test["survived"], classification)
MAE = mean_absolute_error(test["survived"], classification)
MSE = mean_squared_error(test["survived"], classification)

print(" ACC : %.2f" % accuracy)
print(" MAE : %.2f" % MAE)
print(" MSE : %.2f" % MSE)


[0 1]
 ACC : 0.65
 MAE : 0.35
 MSE : 0.35


Identify the best K

In [102]:
# Fit one KNN model per k in 1 .. Ks-1 and record its accuracy on the test
# rows. accuracy[k - 1] holds the score for k neighbours.
Ks = 10
accuracy = np.zeros((Ks-1))
for n in range(1, Ks):
    KNN = KNeighborsClassifier(n_neighbors = n).fit(train[["age", "fare"]], train["survived"])
    classification = KNN.predict(test[["age", "fare"]])
    accuracy[n - 1] = accuracy_score(test["survived"], classification)

# argmax is 0-based while k starts at 1, hence the +1.
# Note: after the loop KNN is the model for the last k tried (Ks - 1), which
# is not necessarily the best one.
print("Best  ACC : %.2f" % accuracy.max(), ", with k = ", accuracy.argmax() + 1)


Best  ACC : 0.71 , with k =  9


# Classification - Logistic Regression

Reuse the "Import all required packages" and "Load & profile the data" steps from the Classification - KNN section above, and additionally import LogisticRegression from sklearn.linear_model and a few extra metrics from sklearn.metrics.

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score

Train with Logistic Regression

In [0]:
# Logistic regression on age and fare, using the liblinear solver with
# C = 0.01 (scikit-learn's inverse regularisation strength).
LGR = LogisticRegression(C = 0.01, solver = 'liblinear')
LGR.fit(train[["age", "fare"]], train["survived"])

Classify new data & evaluate the model (plus F1 and classification report)

In [105]:
# Two hypothetical passengers given as (age, fare). The model was fitted on a
# DataFrame, so the new samples are wrapped in a DataFrame with the same
# column names; a bare list makes recent scikit-learn versions warn that
# "X does not have valid feature names".
newPassengers = pd.DataFrame([[17, 8], [8, 10]], columns=["age", "fare"])
newClassification = LGR.predict(newPassengers)
print(newClassification)

# Score the model on the held-out rows. F1 is averaged over both classes
# weighted by their support; the confusion matrix rows/columns are ordered
# as labels [0, 1].
classification = LGR.predict(test[["age", "fare"]])
accuracy = accuracy_score(test["survived"], classification)
MAE = mean_absolute_error(test["survived"], classification)
MSE = mean_squared_error(test["survived"], classification)
F1  = f1_score(test["survived"], classification, average='weighted')
cnf_matrix = confusion_matrix(test["survived"], classification, labels=[0,1])


print(" ACC : %.2f" % accuracy)
print(" MAE : %.2f" % MAE)
print(" MSE : %.2f" % MSE)
print(" F1  : %.2f" % F1)
print (classification_report(test["survived"], classification))
print(cnf_matrix)

[0 0]
 ACC : 0.69
 MAE : 0.31
 MSE : 0.31
 F1  : 0.65
              precision    recall  f1-score   support

           0       0.69      0.94      0.80       109
           1       0.70      0.26      0.38        61

    accuracy                           0.69       170
   macro avg       0.69      0.60      0.59       170
weighted avg       0.69      0.69      0.65       170

[[102   7]
 [ 45  16]]


Probability of each class

In [106]:
# Class probabilities for every held-out passenger: one row per passenger,
# one column per class, in the order given by LGR.classes_.
classificationProb = LGR.predict_proba(
    test[["age", "fare"]]
)
print(classificationProb)

[[0.69007147 0.30992853]
 [0.69170772 0.30829228]
 [0.54208107 0.45791893]
 [0.43746036 0.56253964]
 [0.65844147 0.34155853]
 [0.46460229 0.53539771]
 [0.72289824 0.27710176]
 [0.71649002 0.28350998]
 [0.60637022 0.39362978]
 [0.70378062 0.29621938]
 [0.63095271 0.36904729]
 [0.7164184  0.2835816 ]
 [0.61834738 0.38165262]
 [0.52140729 0.47859271]
 [0.66904708 0.33095292]
 [0.72305135 0.27694865]
 [0.57430777 0.42569223]
 [0.69007147 0.30992853]
 [0.42018466 0.57981534]
 [0.64991475 0.35008525]
 [0.75712645 0.24287355]
 [0.50003476 0.49996524]
 [0.67323782 0.32676218]
 [0.68579719 0.31420281]
 [0.69007147 0.30992853]
 [0.59896215 0.40103785]
 [0.68693549 0.31306451]
 [0.69007147 0.30992853]
 [0.72881569 0.27118431]
 [0.86815741 0.13184259]
 [0.68077662 0.31922338]
 [0.68987553 0.31012447]
 [0.67693737 0.32306263]
 [0.70269272 0.29730728]
 [0.74197572 0.25802428]
 [0.47512368 0.52487632]
 [0.52171805 0.47828195]
 [0.55573426 0.44426574]
 [0.64176081 0.35823919]
 [0.6897107  0.3102893 ]


# Classification - Decision Tree

Reuse the "Import all required packages" and "Load & profile the data" steps from the Classification - KNN section above, and additionally import DecisionTreeClassifier from sklearn.tree.

In [0]:
from sklearn.tree import DecisionTreeClassifier

Training Process

In [0]:
# Decision tree on age and fare, split on Gini impurity and capped at depth 10.
DST = DecisionTreeClassifier(criterion="gini", max_depth = 10)
DST.fit(train[["age", "fare"]], train["survived"])

Classify new data & evaluate the model

In [109]:
# Two hypothetical passengers given as (age, fare). The model was fitted on a
# DataFrame, so the new samples are wrapped in a DataFrame with the same
# column names; a bare list makes recent scikit-learn versions warn that
# "X does not have valid feature names".
newPassengers = pd.DataFrame([[17, 8], [8, 10]], columns=["age", "fare"])
newClassification = DST.predict(newPassengers)
print(newClassification)

# Score the tree on the held-out rows. F1 is averaged over both classes
# weighted by their support; the confusion matrix rows/columns are ordered
# as labels [0, 1].
classification = DST.predict(test[["age", "fare"]])
accuracy = accuracy_score(test["survived"], classification)
MAE = mean_absolute_error(test["survived"], classification)
MSE = mean_squared_error(test["survived"], classification)
F1  = f1_score(test["survived"], classification, average='weighted')
cnf_matrix = confusion_matrix(test["survived"], classification, labels=[0,1])


print(" ACC : %.2f" % accuracy)
print(" MAE : %.2f" % MAE)
print(" MSE : %.2f" % MSE)
print(" F1  : %.2f" % F1)
print (classification_report(test["survived"], classification))
print(cnf_matrix)

[0 0]
 ACC : 0.65
 MAE : 0.35
 MSE : 0.35
 F1  : 0.65
              precision    recall  f1-score   support

           0       0.73      0.73      0.73       109
           1       0.52      0.51      0.51        61

    accuracy                           0.65       170
   macro avg       0.62      0.62      0.62       170
weighted avg       0.65      0.65      0.65       170

[[80 29]
 [30 31]]


# Classification - SVM

Reuse the "Import all required packages" and "Load & profile the data" steps from the Classification - KNN section above, and additionally import the sklearn.svm module.

In [0]:
from sklearn import svm

Training Process

In [0]:
# Support-vector classifier with scikit-learn's default settings, trained on
# age and fare.
SVM = svm.SVC()
SVM.fit(train[["age", "fare"]], train["survived"])

Classify new data & evaluate the model

In [112]:
# Two hypothetical passengers given as (age, fare). The model was fitted on a
# DataFrame, so the new samples are wrapped in a DataFrame with the same
# column names; a bare list makes recent scikit-learn versions warn that
# "X does not have valid feature names".
newPassengers = pd.DataFrame([[17, 8], [8, 10]], columns=["age", "fare"])
newClassification = SVM.predict(newPassengers)
print(newClassification)

# Score the classifier on the held-out rows. F1 is averaged over both classes
# weighted by their support; the confusion matrix rows/columns are ordered
# as labels [0, 1].
classification = SVM.predict(test[["age", "fare"]])
accuracy = accuracy_score(test["survived"], classification)
MAE = mean_absolute_error(test["survived"], classification)
MSE = mean_squared_error(test["survived"], classification)
F1  = f1_score(test["survived"], classification, average='weighted')
cnf_matrix = confusion_matrix(test["survived"], classification, labels=[0,1])


print(" ACC : %.2f" % accuracy)
print(" MAE : %.2f" % MAE)
print(" MSE : %.2f" % MSE)
print(" F1  : %.2f" % F1)
print (classification_report(test["survived"], classification))
print(cnf_matrix)

[0 0]
 ACC : 0.72
 MAE : 0.28
 MSE : 0.28
 F1  : 0.68
              precision    recall  f1-score   support

           0       0.71      0.96      0.82       109
           1       0.82      0.30      0.43        61

    accuracy                           0.72       170
   macro avg       0.76      0.63      0.63       170
weighted avg       0.75      0.72      0.68       170

[[105   4]
 [ 43  18]]


# Classification - Additional Material

Import required packages

In [0]:
import pandas as pd
import numpy as np
from sklearn import svm, preprocessing
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

Pre-processing

In [114]:
# Reload the raw Titanic data so this section starts from a fresh frame.
dataTitanic = pd.read_csv("titanic_train.csv")

# Print each column's dtype; then, for non-object columns that contain
# missing values, replace the gaps with the column mean.
for columnName in dataTitanic:
  columnData = dataTitanic[columnName]
  print(columnName, columnData.dtypes)
  if columnData.dtypes == object or not columnData.isna().values.any():
      continue
  dataTitanic[columnName] = columnData.fillna(columnData.mean())



passenger_id int64
pclass int64
name object
sex object
age float64
sibsp int64
parch int64
ticket object
fare float64
cabin object
embarked object
boat object
body float64
home.dest object
survived int64


Preprocess - encode non real values

In [0]:
# Feature matrix with columns age, fare, sex (as a NumPy array).
X = dataTitanic[["age", "fare", "sex"]].values

# Replace the sex strings with integer codes. LabelEncoder sorts its classes,
# so 'female' becomes 0 and 'male' becomes 1.
encode = preprocessing.LabelEncoder().fit(['male','female'])
X[:,2] = encode.transform(X[:,2])
Y = dataTitanic["survived"]

# 80/20 split, seeded so the SVM and Decision Tree results that follow are
# reproducible across Restart & Run All.
XTrain, XTest, YTrain, YTest = train_test_split(X,Y, test_size=0.2, random_state=42)

Classify new data & evaluate the model - SVM

In [116]:
# Support-vector classifier with default settings on (age, fare, encoded sex).
SVM = svm.SVC()
SVM.fit(XTrain, YTrain)

# Two hypothetical passengers given as (age, fare, encoded sex).
newClassification = SVM.predict([[17,8,1],[8,10,0]])
print(newClassification)

# Predictions for the held-out rows and the metrics derived from them.
classification = SVM.predict(XTest)
accuracy = accuracy_score(YTest, classification)
MAE = mean_absolute_error(YTest, classification)
MSE = mean_squared_error(YTest, classification)
F1  = f1_score(YTest, classification, average='weighted')
cnf_matrix = confusion_matrix(YTest, classification, labels=[0,1])

# Scalar metrics first, then the per-class report and the confusion matrix.
for template, score in (
    (" ACC : %.2f", accuracy),
    (" MAE : %.2f", MAE),
    (" MSE : %.2f", MSE),
    (" F1  : %.2f", F1),
):
    print(template % score)
print (classification_report(YTest, classification))
print(cnf_matrix)

[0 0]
 ACC : 0.66
 MAE : 0.34
 MSE : 0.34
 F1  : 0.62
              precision    recall  f1-score   support

           0       0.68      0.89      0.77       108
           1       0.57      0.26      0.36        62

    accuracy                           0.66       170
   macro avg       0.62      0.57      0.56       170
weighted avg       0.64      0.66      0.62       170

[[96 12]
 [46 16]]


Classify new data & evaluate the model - Decision Tree

In [117]:
# Decision tree (Gini impurity, depth capped at 10) on (age, fare, encoded sex).
DST = DecisionTreeClassifier(criterion="gini", max_depth = 10).fit(XTrain, YTrain)

# Two hypothetical passengers given as (age, fare, encoded sex). The
# prediction must come from the tree trained above; the previous version
# called SVM.predict here and so reported the SVM's answer in this section.
newClassification = DST.predict([[17,8,1],[8,10,0]])
print(newClassification)

# Score the tree on the held-out rows. F1 is averaged over both classes
# weighted by their support; the confusion matrix rows/columns are ordered
# as labels [0, 1].
classification = DST.predict(XTest)
accuracy = accuracy_score(YTest, classification)
MAE = mean_absolute_error(YTest, classification)
MSE = mean_squared_error(YTest, classification)
F1  = f1_score(YTest, classification, average='weighted')
cnf_matrix = confusion_matrix(YTest, classification, labels=[0,1])


print(" ACC : %.2f" % accuracy)
print(" MAE : %.2f" % MAE)
print(" MSE : %.2f" % MSE)
print(" F1  : %.2f" % F1)
print (classification_report(YTest, classification))
print(cnf_matrix)

[0 0]
 ACC : 0.71
 MAE : 0.29
 MSE : 0.29
 F1  : 0.72
              precision    recall  f1-score   support

           0       0.80      0.72      0.76       108
           1       0.59      0.69      0.64        62

    accuracy                           0.71       170
   macro avg       0.70      0.71      0.70       170
weighted avg       0.73      0.71      0.72       170

[[78 30]
 [19 43]]
