In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA

In [14]:
# Load the training set from train.csv into a DataFrame
df = pd.read_csv('train.csv')

# Turn the frame into a NumPy array and strip column 0 (the id) in one step;
# np.delete returns a new array, so the DataFrame's own buffer is untouched
raw_values = df.values
data_matrix = np.delete(raw_values, 0, axis=1)

# Preview the first rows as a table
df_train = pd.DataFrame(data_matrix)
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,4096
0,0.0,0.0,1.272801,0.290501,0.581446,0.0,0.0,0,0.0,0.0,...,1.645888,0.86964,0.302432,0.953719,0.022545,0.498048,0.0,0.034988,0.692382,Orange_Ripe
1,0.0,0.0,1.542096,0.0,0.896557,0.049978,0.0,0,0.117847,0.0,...,1.50422,0.622686,0.588427,0.524415,0.305426,0.386204,0.0,0.0,0.668196,Banana_Ripe
2,0.0,0.0,1.098595,0.571866,0.500355,0.0,0.0,0,0.493137,0.0,...,1.169341,0.913239,0.064404,0.53127,0.0,0.471604,0.0,0.0,0.65825,Mango_Raw
3,0.0,0.101666,1.159194,0.599216,0.893206,0.0,0.200139,0,0.645675,0.0,...,0.560686,1.243676,0.432523,0.701881,0.0,0.589985,0.0,0.0,0.591165,Leeche_Raw
4,0.0,0.0,1.178603,0.362568,0.577602,0.0,0.0,0,0.079862,0.0,...,1.206032,0.736831,0.345906,0.878515,0.119,0.261441,0.0,0.0,0.458905,Mango_Ripe


In [15]:
# Cluster the training samples with k-means to get a cluster ID for each data
# point. The cluster IDs are used as an additional feature for classification.

# Import KMeans
from sklearn.cluster import KMeans

# Every column except the last is treated as a feature (the last column is
# excluded from clustering). The matrix presumably has object dtype because it
# also carries the string class label, so convert the slice to floats here.
cluster_features = data_matrix[:, :-1].astype(float)

# Create a KMeans instance with 60 clusters: model
# random_state pins the centroid initialisation so the cluster IDs (and the
# feature derived from them) are reproducible across runs; n_init is spelled
# out because its default differs between scikit-learn releases.
model = KMeans(n_clusters=60, n_init=10, random_state=0)

# Fit model to samples
model.fit(cluster_features)

# Determine the cluster label of every training sample: labels
labels = model.predict(cluster_features)

# Display cluster labels
print(labels)

[48 32 49 ...  7 33 39]


In [16]:
# Add the cluster IDs as an additional feature (new first column).
# The insert is guarded so that re-running this cell cannot prepend the column
# a second time: the k-means model was fitted on every column but the last, so
# before the insert the matrix is exactly one column wider than the model's
# feature count.
n_clustered_features = model.cluster_centers_.shape[1]
if data_matrix.shape[1] == n_clustered_features + 1:
    data_matrix = np.insert(data_matrix, 0, labels, axis=1)

# Expected layout now: cluster ID + clustered features + last (label) column
assert data_matrix.shape[1] == n_clustered_features + 2, data_matrix.shape

# Display as a table
df_train = pd.DataFrame(data_matrix)
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4088,4089,4090,4091,4092,4093,4094,4095,4096,4097
0,48,0.0,0.0,1.272801,0.290501,0.581446,0.0,0.0,0,0.0,...,1.645888,0.86964,0.302432,0.953719,0.022545,0.498048,0.0,0.034988,0.692382,Orange_Ripe
1,32,0.0,0.0,1.542096,0.0,0.896557,0.049978,0.0,0,0.117847,...,1.50422,0.622686,0.588427,0.524415,0.305426,0.386204,0.0,0.0,0.668196,Banana_Ripe
2,49,0.0,0.0,1.098595,0.571866,0.500355,0.0,0.0,0,0.493137,...,1.169341,0.913239,0.064404,0.53127,0.0,0.471604,0.0,0.0,0.65825,Mango_Raw
3,22,0.0,0.101666,1.159194,0.599216,0.893206,0.0,0.200139,0,0.645675,...,0.560686,1.243676,0.432523,0.701881,0.0,0.589985,0.0,0.0,0.591165,Leeche_Raw
4,39,0.0,0.0,1.178603,0.362568,0.577602,0.0,0.0,0,0.079862,...,1.206032,0.736831,0.345906,0.878515,0.119,0.261441,0.0,0.0,0.458905,Mango_Ripe


In [17]:
# Load the test set from test.csv
df_test = pd.read_csv('test.csv')

# Convert to a NumPy array and strip column 0 (the id) in one step
raw_test_values = df_test.values
data_matrix_test = np.delete(raw_test_values, 0, axis=1)

# Preview the first rows as a table (df_test now refers to the id-less frame)
df_test = pd.DataFrame(data_matrix_test)
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0.0,0.0,0.908889,0.251257,0.662262,0.042495,0.0,0.0,0.964784,0.0,...,0.694072,1.146161,1.483842,0.717836,0.472616,0.0,0.488022,0.0,0.0,0.65567
1,0.0,0.0,1.191055,0.40735,0.441898,0.0,0.334858,0.0,0.295357,0.0,...,0.273436,1.466932,0.94085,0.470344,1.032085,0.0,0.65407,0.0,0.0,0.614493
2,0.0,0.261903,0.992782,0.301102,0.636006,0.009558,0.009448,0.0,0.974949,0.0,...,0.0,0.769983,0.83436,0.369656,1.000858,0.431571,0.361993,0.0,0.0,0.392158
3,0.0,0.0,1.352401,0.346003,0.401412,0.0,0.0,0.0,0.450667,0.0,...,0.339935,1.325595,0.981124,0.486731,0.747392,0.0,0.300671,0.0,0.0,0.628365
4,0.0,0.0,1.114281,0.69614,0.121505,0.0,0.0,0.0,0.591384,0.0,...,0.093661,0.875113,0.360689,0.65923,0.546044,0.0,0.427255,0.0,0.0,0.835671


In [18]:
# Apply the same clustering to the test data.
# The fitted model only accepts the columns it was trained on. Taking the last
# n columns selects exactly those, both before and after the cluster-ID column
# has been prepended, so this cell can be re-run safely.
n_clustered_features = model.cluster_centers_.shape[1]
labels_test = model.predict(data_matrix_test[:, -n_clustered_features:])

# Add the cluster IDs as an additional feature (new first column), only once
if data_matrix_test.shape[1] == n_clustered_features:
    data_matrix_test = np.insert(data_matrix_test, 0, labels_test, axis=1)

# Expected layout now: cluster ID + clustered features
assert data_matrix_test.shape[1] == n_clustered_features + 1, data_matrix_test.shape

# Display as a table
df_test = pd.DataFrame(data_matrix_test)
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,4096
0,35.0,0.0,0.0,0.908889,0.251257,0.662262,0.042495,0.0,0.0,0.964784,...,0.694072,1.146161,1.483842,0.717836,0.472616,0.0,0.488022,0.0,0.0,0.65567
1,19.0,0.0,0.0,1.191055,0.40735,0.441898,0.0,0.334858,0.0,0.295357,...,0.273436,1.466932,0.94085,0.470344,1.032085,0.0,0.65407,0.0,0.0,0.614493
2,44.0,0.0,0.261903,0.992782,0.301102,0.636006,0.009558,0.009448,0.0,0.974949,...,0.0,0.769983,0.83436,0.369656,1.000858,0.431571,0.361993,0.0,0.0,0.392158
3,42.0,0.0,0.0,1.352401,0.346003,0.401412,0.0,0.0,0.0,0.450667,...,0.339935,1.325595,0.981124,0.486731,0.747392,0.0,0.300671,0.0,0.0,0.628365
4,6.0,0.0,0.0,1.114281,0.69614,0.121505,0.0,0.0,0.0,0.591384,...,0.093661,0.875113,0.360689,0.65923,0.546044,0.0,0.427255,0.0,0.0,0.835671


In [28]:
# Separate inputs from targets: the last column of data_matrix is the target,
# everything before it is an input feature. Copies keep data_matrix untouched.
X_train, y_train = data_matrix[:, :-1].copy(), data_matrix[:, -1].copy()
X_test = data_matrix_test.copy()

# Standardise the features. The scaler's statistics are learned from the
# training inputs only and then re-used unchanged for the test inputs.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Report the resulting matrix shapes (train first, then test)
for scaled_split in (X_train, X_test):
    print(scaled_split.shape)


(1216, 4097)
(415, 4097)


In [29]:
# Dimensionality reduction in two stages: PCA down to 600 components, then LDA
# down to 19 discriminant axes.
#
# NOTE: this cell overwrites X_train / X_test with their projected versions,
# so it must be run exactly once after the scaling cell; running it again
# would feed the already-projected data back into PCA.

# Create a PCA object.
# random_state is fixed because the default svd_solver='auto' may pick the
# randomized SVD for a matrix of this size, which otherwise yields slightly
# different components (and downstream predictions) on every run.
pca = PCA(n_components=600, random_state=0)

# Fit the PCA object to the training data only
pca.fit(X_train)

# Transform the training data
X_train = pca.transform(X_train)

# Transform the test data with the projection learned on the training data
X_test = pca.transform(X_test)

# Apply LDA (supervised, so it needs the training labels).
# LDA yields at most n_classes - 1 components; 19 presumably corresponds to
# 20 target classes - TODO confirm against y_train.
lda = LDA(n_components=19)

lda.fit(X_train, y_train)

# Transform the training data
X_train = lda.transform(X_train)

# Transform the test data
X_test = lda.transform(X_test)

In [23]:
# Optional: split X_train and y_train into training and validation sets.
# Currently disabled, so the classifier below is trained on all of X_train.
#X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Create a random forest classifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=1000, bootstrap=False, max_depth=90, min_samples_leaf=3, min_samples_split=6, n_jobs=-1, random_state=0, verbose=1)

# Train the classifier
rf.fit(X_train, y_train)

# Predict the test set (no validation split is active in this cell)
y_pred = rf.predict(X_test)

# Create a submission file: a header line followed by one
# "<row index>,<predicted label>" line per prediction.
submission_lines = ['ID,Category']
submission_lines.extend('{},{}'.format(i, label) for i, label in enumerate(y_pred))

# The context manager closes the file even if the write fails. Joining with
# '\n' keeps the exact original layout (no trailing newline).
with open('PCA_LDA_Clustering.csv', 'w') as file:
    file.write('\n'.join(submission_lines))



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.2s finished


In [22]:
# Predict the test set with the fitted random forest
y_pred = rf.predict(X_test)

# Create a submission file: a header line followed by one
# "<row index>,<predicted label>" line per prediction.
submission_lines = ['ID,Category']
submission_lines.extend('{},{}'.format(i, label) for i, label in enumerate(y_pred))

# The context manager closes the file even if the write fails. Joining with
# '\n' keeps the exact original layout (no trailing newline).
with open('PCA_LDA_Clustering.csv', 'w') as file:
    file.write('\n'.join(submission_lines))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.1s finished


In [26]:
# Gradient boosting classifier
from sklearn.ensemble import GradientBoostingClassifier

# Build the model and train it on X_train / y_train in one expression
# (fit() returns the estimator itself, so gb is the fitted classifier).
gb = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=4,
    random_state=0,
    verbose=1,
).fit(X_train, y_train)

# Predict labels for X_test; this replaces any earlier y_pred
y_pred = gb.predict(X_test)



      Iter       Train Loss   Remaining Time 
         1           1.3535           34.42s
         2           1.0584           35.47s
         3           0.8533           35.49s
         4           0.6943           35.63s
         5           0.5721           35.44s
         6           0.4742           35.25s
         7           0.3942           35.02s
         8           0.3289           34.85s
         9           0.2764           34.72s
        10           0.2316           34.65s
        20           0.0418           32.60s
        30           0.0080           30.56s
        40           0.0016           28.64s
        50           0.0003           26.81s
        60           0.0001           25.07s
        70           0.0000           23.23s
        80           0.0000           21.40s
        90           0.0000           19.57s
       100           0.0000           17.71s
       200           0.0000            0.00s


In [25]:
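# Calculate the accuracy using numpy.
# NOTE: this is disabled because y_val is not defined by any active code in
# this notebook (it requires a train/validation split). The number shown below
# this cell appears to be left over from an earlier run in which it was enabled.
# accuracy = np.sum(y_pred == y_val) / len(y_val)

# print(accuracy)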

0.9467213114754098


In [27]:
# Write the current predictions (y_pred) to a submission file: a header line
# followed by one "<row index>,<predicted label>" line per prediction.
submission_lines = ['ID,Category']
submission_lines.extend('{},{}'.format(i, label) for i, label in enumerate(y_pred))

# The context manager closes the file even if the write fails. Joining with
# '\n' keeps the exact original layout (no trailing newline).
with open('PCA_LDA_Boost_Cluster.csv', 'w') as file:
    file.write('\n'.join(submission_lines))