In [104]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import hdbscan

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# Import Fuzzy c means clustering
from fcmeans import FCM

In [105]:
# Read the data from the file train.csv
df = pd.read_csv('train.csv')
data_matrix = np.array(df.values)

# Drop the first column (id)
data_matrix = np.delete(data_matrix, 0, 1)

# Display as a table
df_train = pd.DataFrame(data_matrix)
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,4096
0,0.0,0.0,1.272801,0.290501,0.581446,0.0,0.0,0,0.0,0.0,...,1.645888,0.86964,0.302432,0.953719,0.022545,0.498048,0.0,0.034988,0.692382,Orange_Ripe
1,0.0,0.0,1.542096,0.0,0.896557,0.049978,0.0,0,0.117847,0.0,...,1.50422,0.622686,0.588427,0.524415,0.305426,0.386204,0.0,0.0,0.668196,Banana_Ripe
2,0.0,0.0,1.098595,0.571866,0.500355,0.0,0.0,0,0.493137,0.0,...,1.169341,0.913239,0.064404,0.53127,0.0,0.471604,0.0,0.0,0.65825,Mango_Raw
3,0.0,0.101666,1.159194,0.599216,0.893206,0.0,0.200139,0,0.645675,0.0,...,0.560686,1.243676,0.432523,0.701881,0.0,0.589985,0.0,0.0,0.591165,Leeche_Raw
4,0.0,0.0,1.178603,0.362568,0.577602,0.0,0.0,0,0.079862,0.0,...,1.206032,0.736831,0.345906,0.878515,0.119,0.261441,0.0,0.0,0.458905,Mango_Ripe


In [106]:
# Opening the test file for the test data
df_test = pd.read_csv('test.csv')
data_matrix_test = np.array(df_test.values)

# Drop the first column (id)
data_matrix_test = np.delete(data_matrix_test, 0, 1)

# Display as a table
df_test = pd.DataFrame(data_matrix_test)
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0.0,0.0,0.908889,0.251257,0.662262,0.042495,0.0,0.0,0.964784,0.0,...,0.694072,1.146161,1.483842,0.717836,0.472616,0.0,0.488022,0.0,0.0,0.65567
1,0.0,0.0,1.191055,0.40735,0.441898,0.0,0.334858,0.0,0.295357,0.0,...,0.273436,1.466932,0.94085,0.470344,1.032085,0.0,0.65407,0.0,0.0,0.614493
2,0.0,0.261903,0.992782,0.301102,0.636006,0.009558,0.009448,0.0,0.974949,0.0,...,0.0,0.769983,0.83436,0.369656,1.000858,0.431571,0.361993,0.0,0.0,0.392158
3,0.0,0.0,1.352401,0.346003,0.401412,0.0,0.0,0.0,0.450667,0.0,...,0.339935,1.325595,0.981124,0.486731,0.747392,0.0,0.300671,0.0,0.0,0.628365
4,0.0,0.0,1.114281,0.69614,0.121505,0.0,0.0,0.0,0.591384,0.0,...,0.093661,0.875113,0.360689,0.65923,0.546044,0.0,0.427255,0.0,0.0,0.835671


In [107]:
X_train = data_matrix[:, :-1].copy()
y_train = data_matrix[:, -1].copy()

X_test = data_matrix_test.copy()

#Scale the data
sc = StandardScaler()

X_train = sc.fit_transform(X_train)

X_test = sc.transform(X_test)

print(X_train.shape)
print(X_test.shape)

(1216, 4096)
(415, 4096)


In [108]:
# Create a PCA object
pca = PCA(n_components=60)

# Fit the PCA object to the training data
pca.fit(X_train)

# Transform the training data
X_train = pca.transform(X_train)

# Transform the test data
X_test = pca.transform(X_test)

# Apply LDA
lda = LDA(n_components=19)

lda.fit(X_train, y_train)

# Transform the training data
X_train = lda.transform(X_train)

# Transform the test data
X_test = lda.transform(X_test)

In [109]:
#Perform KMeans clustering

# Number of clusters
k = 15
# Create a KMeans instance with k clusters: model
model = KMeans(n_clusters=k, random_state=0)

# Fit model to samples
model.fit(X_train)

# Determine the cluster labels of new_points: labels
labels = model.predict(X_train)

# Print the silhouette score
print(silhouette_score(X_train, labels))


0.4155334245965918


In [110]:
# Add the cluster labels to X_train as an additional column
X_train = np.insert(X_train, 0, labels, axis=1)

# Display as a table
df_train = pd.DataFrame(X_train)
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,12.0,0.042571,-1.60672,1.882916,-0.650002,0.532542,0.652163,-0.449165,-1.1207,-3.136151,2.795718,1.079781,0.29727,2.061858,-0.017394,-0.117055,-0.106126,-0.152927,-0.613613,-0.23825
1,2.0,-9.483774,9.981874,3.856625,-3.088943,-2.590716,0.100538,-2.695809,0.388219,0.003939,-1.306911,-0.199922,-0.419499,0.698839,0.498786,-0.71773,0.440142,-0.493623,-0.219645,-2.043925
2,1.0,-3.432217,-3.534378,-3.459121,0.722862,-1.108722,-0.906552,-1.522708,1.770594,-0.783043,3.134315,-1.04393,-0.477468,-1.749867,3.044505,1.049642,1.727844,2.059753,1.038736,-1.327174
3,13.0,4.426165,1.596966,-6.4206,-5.29457,-3.93081,8.176945,2.714277,-2.785338,-2.346949,-1.567775,-4.565667,-4.260765,1.395051,-0.783193,-0.096741,0.041868,-1.484508,0.170008,-0.621402
4,8.0,-1.451724,-3.079433,1.193487,-2.124247,0.54488,-0.826233,0.201759,-1.397589,-1.592018,1.31443,0.053609,0.156484,-0.237233,-2.790798,-1.776633,2.417427,-1.567162,-0.286471,4.453376


In [111]:
# Apply the same clustering to test data

# Determine the cluster labels of new_points: labels
labels_test = model.predict(X_test)

# Add the cluster labels to the data matrix as the first column
X_test = np.insert(X_test, 0, labels_test, axis=1)

# Display as a table
df_test = pd.DataFrame(X_test)
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,13.0,17.13847,-20.882449,-0.863408,-20.396135,-8.168812,12.722543,-2.170165,0.026488,-5.576132,-11.643354,-19.245776,-2.534509,-9.770631,3.013182,2.73876,7.763655,-1.004449,6.476822,8.569841
1,1.0,-1.266213,-1.93977,-1.278824,-0.785335,-1.775535,-1.09043,-2.585518,1.972713,-2.12986,0.510029,-0.536019,-0.982799,-0.550634,1.24931,3.697787,3.059655,2.283885,4.017174,-0.190692
2,5.0,8.851707,1.802146,5.008533,2.152741,0.138774,-0.906209,-1.143919,0.383248,2.733869,-0.420408,0.402977,-1.529428,-0.557455,1.369258,-1.074559,-1.2798,-0.630675,0.458928,1.362931
3,4.0,2.45153,-1.129766,-8.57631,-4.789351,-4.194564,0.84696,1.434483,-0.755792,-0.991778,-4.113129,-0.224938,7.013897,-1.032787,1.075998,0.089068,0.218356,-1.664815,-0.912084,0.832372
4,12.0,1.760574,-1.858522,1.027178,-0.982028,0.942113,-1.200827,1.384009,-4.894794,-2.126578,3.798764,1.84933,1.167311,-1.990908,1.004224,-2.716261,-1.302783,-0.822314,-0.05749,-0.565228


In [None]:
# Perform fuzzy c-means clustering

# fuzzy_model = 

In [36]:
# Find covariance matrix of training data
cov_mat = np.cov(X_train.T)

print(cov_mat.shape)

(4097, 4097)


In [37]:
# Find eigenvalues and eigenvectors of covariance matrix
# eig_vals, eig_vecs = np.linalg.eig(cov_mat)

In [38]:
# # Sort the eigenvalues in descending order with their corresponding eigenvectors
# eigen_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
# eigen_pairs.sort(key=lambda k: k[0], reverse=True)

# # Find and print the explained variance by the following number of components

# tot = sum(eig_vals)

# var_exp = [(i / tot) for i in sorted(eig_vals, reverse=True)]
# cum_var_exp = np.cumsum(var_exp)
# # 5 components
# print(cum_var_exp[5])

# # 10 components
# print(cum_var_exp[10])

# # 50 components
# print(cum_var_exp[50])

# # 100 components
# print(cum_var_exp[100])



(0.3776189072375084+0j)
(0.5008372683288967+0j)
(0.7462262376062186+0j)
(0.831056512835016+0j)


In [112]:
# Split the X_train and y_train into training and validation sets
X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [113]:
# Create a random forest classifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=2000, bootstrap=False, max_depth=90, min_samples_leaf=3, min_samples_split=6, n_jobs=-1, random_state=42, verbose=1)

# Train the classifier
rf.fit(X_train2, y_train2)

# Predict the validation set
y_pred = rf.predict(X_val)

# Calculate the accuracy
from sklearn.metrics import accuracy_score

print("Validation Accuracy:", accuracy_score(y_val, y_pred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:    1.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.0s


Validation Accuracy: 0.9795081967213115


[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 2000 out of 2000 | elapsed:    0.1s finished


In [91]:
# Create a Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

# Create a Gradient Boosting model
gb = GradientBoostingClassifier(n_estimators=100)

# Fit the model to the training data
gb.fit(X_train2, y_train2)

# Predict the labels of the test data
y_pred = gb.predict(X_val)

# Calculate the accuracy of the model
accuracy = np.sum(y_pred == y_val) / len(y_val)

print('Validation Accuracy: ', accuracy)

Validation Accuracy:  0.75
