# CA3

### Imports

In [102]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier


### Reading data

In [103]:
df = pd.read_csv("assets/train.csv", index_col=0) # First column as row index

### Data exploration and visualisation

In [104]:
# Visualize the data
"""
sns.pairplot(df, hue="Edible")
plt.tight_layout()
plt.show()
"""

print(df.describe())


       Acoustic Firmness Index  Atmospheric Pressure at Harvest (Pa)  \
count              1245.000000                           1248.000000   
mean                 21.570077                         101327.543269   
std                   8.131888                           4772.582203   
min                   0.600000                          83825.000000   
25%                  17.300000                          98095.750000   
50%                  21.300000                         101357.000000   
75%                  25.300000                         104470.750000   
max                 156.520701                         115636.000000   

       Bitterness Scale  Circumference (mm)  Color Intensity (a.u.)  \
count       1244.000000         1246.000000             1248.000000   
mean           1.808682          211.046062               63.300962   
std            1.960279           24.652278                7.118135   
min            0.000000          145.867667               46.060000

### Data cleaning

In [105]:
# Check for missing data
print(df.isnull().values.any()) # print true if it is missing data, since it is false, there is no missing data


#df = df.drop('Find Distance from Main Vulcano (km)', axis=1) 
#df = df.drop('Atmospheric Pressure at Harvest (Pa)', axis=1) 
#df = df.drop('Magnetic orientation (degree)', axis=1) 
#df = df.drop('Soil pH where Grown', axis=1) 

columns = df.columns
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') # other popular choices: "median", "most_frequent"
imputer.fit(df.values)
df = imputer.transform(df.values)
df = pd.DataFrame(df, columns=columns)


for i in df.columns:
    z_scores = (df[i] - np.mean(df[i])) / np.std(df[i])
    df_clean = df[np.abs(z_scores) < 1]


True


### Data preprocessing and visualisation

In [106]:
y = df["Edible"]
X = df.drop(columns="Edible")

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

sc = StandardScaler()
sc.fit(X_train)

X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

### Modelling

In [97]:

"""
for i in range(10, 1000):
    ppn = Perceptron(max_iter=i, eta0=0.0001, random_state=0)
    ppn.fit(X_train_sc, y_train)
    y_pred = ppn.predict(X_test_sc)
    print("pp", accuracy_score(y_test, y_pred), i)

for i in range(10, 1000):
    lg = LogisticRegression(max_iter=i, penalty=None)
    lg.fit(X_train_sc, y_train)
    y_pred = lg.predict(X_test_sc)
    print("lg", accuracy_score(y_test, y_pred), i)

for i in range(10, 1000):
    lg = LogisticRegression(max_iter=i, penalty="l2")
    lg.fit(X_train_sc, y_train)
    y_pred = lg.predict(X_test_sc)
    print("lg", accuracy_score(y_test, y_pred), i, "pen")

for i in range(3, 200):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train_sc, y_train)
    y_pred = knn.predict(X_test_sc)
    print("knn", accuracy_score(y_test, y_pred), i)

for i in range(1, 100):
    dtc = DecisionTreeClassifier(max_depth=1000, criterion='gini', min_samples_leaf=j)
    dtc.fit(X_train_sc, y_train)
    y_pred = dtc.predict(X_test_sc)
    print("dtc", accuracy_score(y_test, y_pred), i, j)

for i in range(1, 100):
    dtc = DecisionTreeClassifier(max_depth=1000, criterion='gini', min_samples_leaf=j)
    dtc.fit(X_train, y_train)
    y_pred = dtc.predict(X_test)
    print("dtc", accuracy_score(y_test, y_pred), i)

"""

for i in range(1, 50):
    rf = RandomForestClassifier(n_estimators=i, random_state=0) #37, 22
    rf.fit(X_train_sc, y_train)
    y_pred = rf.predict(X_test_sc)
    print("rf", accuracy_score(y_test, y_pred), i)


"""
svmlin = svm.SVC(kernel="linear", C=1000)
svmlin.fit(X_train, y_train)
print(svmlin.score(X_test, y_test))

svmpol = svm.SVC(kernel="poly", degree=5, C=1000)
svmpol.fit(X_train, y_train)
print(svmpol.score(X_test, y_test))

svm.SVC(kernel="rbf", gamma=0.3, C=1000)
svmpol.fit(X_train, y_train)
print(svmpol.score(X_test, y_test))
"""

rf 0.842948717948718 1
rf 0.8301282051282052 2
rf 0.8878205128205128 3
rf 0.8814102564102564 4
rf 0.8974358974358975 5
rf 0.8878205128205128 6
rf 0.8942307692307693 7
rf 0.8878205128205128 8
rf 0.9038461538461539 9
rf 0.9038461538461539 10
rf 0.8974358974358975 11
rf 0.907051282051282 12
rf 0.9038461538461539 13
rf 0.907051282051282 14
rf 0.907051282051282 15
rf 0.9038461538461539 16
rf 0.9038461538461539 17
rf 0.9102564102564102 18
rf 0.907051282051282 19
rf 0.907051282051282 20
rf 0.9102564102564102 21
rf 0.9102564102564102 22
rf 0.907051282051282 23
rf 0.9102564102564102 24
rf 0.9102564102564102 25
rf 0.9102564102564102 26
rf 0.9102564102564102 27
rf 0.9102564102564102 28
rf 0.9102564102564102 29
rf 0.9102564102564102 30
rf 0.9102564102564102 31
rf 0.9102564102564102 32
rf 0.9102564102564102 33
rf 0.907051282051282 34
rf 0.907051282051282 35
rf 0.907051282051282 36
rf 0.907051282051282 37
rf 0.907051282051282 38
rf 0.907051282051282 39
rf 0.9038461538461539 40
rf 0.907051282051282 4

'\nsvmlin = svm.SVC(kernel="linear", C=1000)\nsvmlin.fit(X_train, y_train)\nprint(svmlin.score(X_test, y_test))\n\nsvmpol = svm.SVC(kernel="poly", degree=5, C=1000)\nsvmpol.fit(X_train, y_train)\nprint(svmpol.score(X_test, y_test))\n\nsvm.SVC(kernel="rbf", gamma=0.3, C=1000)\nsvmpol.fit(X_train, y_train)\nprint(svmpol.score(X_test, y_test))\n'

### Final evaluation

In [107]:
df = pd.read_csv("assets/train.csv", index_col=0) # First column as row index


#df = df.drop('Find Distance from Main Vulcano (km)', axis=1) 
#df = df.drop('Atmospheric Pressure at Harvest (Pa)', axis=1) 
#df = df.drop('Magnetic orientation (degree)', axis=1) 
#df = df.drop('Soil pH where Grown', axis=1) 

columns = df.columns
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') # other popular choices: "median", "most_frequent"
imputer.fit(df.values)
df = imputer.transform(df.values)
df = pd.DataFrame(df, columns=columns)

for i in df.columns:
    z_scores = (df[i] - np.mean(df[i])) / np.std(df[i])
    df_clean = df[np.abs(z_scores) < 1]


df_y = df["Edible"]
df_x = df.drop(columns="Edible")

sc = StandardScaler()
sc.fit(df_x)

df_x_sc = sc.transform(df_x)

#knn = KNeighborsClassifier(n_neighbors=6)
#knn = DecisionTreeClassifier(max_depth=100, criterion='gini', min_samples_leaf=11)
knn = RandomForestClassifier(n_estimators=18, random_state=0) # 50, 87 # 19


knn.fit(df_x_sc, df_y)

df_train_ped = knn.predict(df_x_sc)
print(accuracy_score(df_train_ped, df_y))

0.9975961538461539


### Kaggle submission

In [108]:
df = pd.read_csv("assets/test.csv", index_col=0)

#df = df.drop('Find Distance from Main Vulcano (km)', axis=1) 
#df = df.drop('Atmospheric Pressure at Harvest (Pa)', axis=1) 
#df = df.drop('Magnetic orientation (degree)', axis=1) 
#df = df.drop('Soil pH where Grown', axis=1) 

columns = df.columns
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') # other popular choices: "median", "most_frequent"
imputer.fit(df.values)
df = imputer.transform(df.values)
df = pd.DataFrame(df, columns=columns)

for i in df.columns:
    z_scores = (df[i] - np.mean(df[i])) / np.std(df[i])
    df_clean = df[np.abs(z_scores) < 1]

df_test_sc = sc.transform(df)

df_test_pred = knn.predict(df_test_sc)

df_results = pd.DataFrame(data=df_test_pred.astype(int), columns=["Edible"])
df_results.index.names = ["index"]

df_results.to_csv('assets/results.csv')
