<h1> KAGGLE ASSIGNMENT - UNSUPERVISED LEARNING </h1>

$$\mathrm{ACF}(k) = \frac{\sum_{t=k+1}^{N} (x_t - \mu)(x_{t-k} - \mu)}{\sum_{t=1}^{N} (x_t - \mu)^2}$$

In [59]:
#21 min to run the whole program
import pandas as pd
from sklearn.preprocessing import RobustScaler
import numpy as np
import statistics as st
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import kurtosis, skew

#auto correlation function at lag k

#it extracts the relationship between past and future data in a time series : about 10 autocorrelations are usually valuable
def calculate_auto_corr(time_series_signal,k):
    """Return the lag-``k`` autocorrelation coefficient of a 1-D signal.

    The value is ``sum(d[t] * d[t + k]) / sum(d[t] ** 2)`` where ``d`` is the
    signal with its mean subtracted. The numerator runs over every pair of
    samples that are ``k`` positions apart; the denominator runs over the
    whole signal.

    Parameters
    ----------
    time_series_signal : array-like
        1-D sequence of numeric samples (list, tuple or ndarray).
    k : int
        Lag in samples; expected to be non-negative.

    Returns
    -------
    float
        Autocorrelation at lag ``k``. ``0.0`` is returned for an empty signal
        or one with zero variance, where the ratio would otherwise be 0/0
        (NaN) and contaminate the downstream feature vector.
    """
    signal=np.asarray(time_series_signal,dtype=float)
    l=len(signal)
    if l==0:
        return 0.0

    # Centre once and reuse for both numerator and denominator.
    centered=signal-np.mean(signal)

    autocorrelation_den=np.sum(centered**2)
    if autocorrelation_den==0:
        # Constant signal: no variance, so no measurable correlation.
        return 0.0

    autocorrelation_num=np.sum(centered[:l-k]*centered[k:])

    return autocorrelation_num/autocorrelation_den

<br>
<br>
<br>
<br>
<h3>Reducing No. of Columns, forming 5 sec segments and Projecting Data into higher dimension </h3>

In [60]:
# 6 min to train
# Input CSV locations; test_path is only defined here and is read by a later cell.
train_path="/content/Train.csv"
test_path="/content/Test.csv"

df_train=pd.read_csv(train_path)

# Everything except the last two columns is treated as signal samples.
df_train_subset=df_train.iloc[:, :-2]
data=np.array(df_train_subset).tolist()

# pre_processed_data[row][segment] is the 23-value feature vector of one segment.
pre_processed_data=[]
for signal_row in data:

    row_features=[]

    # Walk the row in consecutive windows of 125 samples (5 sec each per the
    # original note; 6250 columns become 50 segments) and describe each window
    # with summary statistics plus autocorrelations.
    for start in range(0,len(signal_row),125):
        segment=np.array(signal_row[start:start+125])

        # Autocorrelation coefficients at lags 1..15.
        lag_features=[calculate_auto_corr(segment,lag) for lag in range(1,16)]

        centre=np.mean(segment)
        middle=np.median(segment)
        upper_quartile=np.percentile(segment,75)
        lower_quartile=np.percentile(segment,25)

        summary_features=[
            centre,
            middle,
            upper_quartile-lower_quartile,        # inter-quartile range
            np.std(segment),
            np.var(segment),
            np.ptp(segment),                      # max - min
            np.median(np.abs(segment-middle)),    # median absolute deviation
            np.mean(np.abs(segment-centre)),      # mean absolute deviation
        ]

        row_features.append(summary_features+lag_features)

    pre_processed_data.append(row_features)

<br>
<br>
<br>
<br>
<h3>Segregating Data based on sensors </h3>

In [61]:
# Second-to-last training column: used as the sensor name of each row.
sensors=df_train.iloc[:,-2]


# Map each sensor name to the list of its rows' segment features, in the order
# the rows appear in the training frame. pre_processed_data is aligned
# row-for-row with df_train, so the two can be walked together.
sensor_to_data={}
for sensor_name,row_features in zip(sensors,pre_processed_data):
    sensor_to_data.setdefault(sensor_name,[]).append(row_features)


<br>
<br>
<br>
<br>

<h3>Train 45 unsupervised KNN Models</h3>

In [65]:
# value of k for KNN

k=5
clusters={}


# Fit one nearest-neighbour index per sensor (original note: training 45 models for KNN).
# clusters maps sensor name -> fitted NearestNeighbors over all of that sensor's segments.
for a in sensor_to_data:
    #152 classes in total and each class has 50 points

    # pseudo_labels_map[j] is the position (within this sensor's rows) of the
    # training row that segment j of data_for_knn came from.
    # NOTE(review): this name is rebound on every iteration, so after the loop it
    # holds the map of the LAST sensor only. generate_output_list reads it as a
    # global for every sensor, which is only right if all sensors share the same
    # row/segment layout -- confirm against the data.
    pseudo_labels_map=[]
    curr_data=sensor_to_data[a]
    data_for_knn=[]
    for cnt,b in enumerate(curr_data):
        data_for_knn.extend(b)

        # One label per segment actually appended, so labels and data rows stay
        # aligned even if a row does not split into exactly 50 segments.
        pseudo_labels_map.extend([cnt]*len(b))



    data_for_knn=np.array(data_for_knn)
    pseudo_labels_map=np.array(pseudo_labels_map)


    #using the unsupervised version of KNN
    neigh=NearestNeighbors(n_neighbors=k)
    neigh.fit(data_for_knn)
    clusters[a]=neigh

<br>
<br>
<br>
<br>
<h3>Make predictions for the test data after column reduction, segment formation and higher-dimensional projection</h3>

In [66]:
# 6min to train

# Load the test set; its last two columns are read separately further down
# (sensor and ID), so only the remaining columns are kept as signal values.
df2=pd.read_csv(test_path)
df2_subset=df2.iloc[:, :-2]
# Plain nested lists: one inner list of signal values per test row.
output_data=df2_subset.to_numpy().tolist()


#use majority voting to find the class to be allotted based on k nearest neighbors for the test point
def find_class_using_majority_voting(index_list,distances,pseudo_labels_map):
    """Return the most frequent label among a query point's nearest neighbours.

    Parameters
    ----------
    index_list : ndarray of shape (1, k)
        Neighbour indices for a single query; only the first row is used.
    distances : ndarray
        Neighbour distances. Accepted for interface compatibility; not used.
    pseudo_labels_map : sequence
        ``pseudo_labels_map[j]`` is the label of training point ``j``.

    Returns
    -------
    The label (an element of ``pseudo_labels_map``) with the most votes.
    Ties go to the label that appears first in ``index_list``.

    Notes
    -----
    Votes are counted per *label*, not per neighbour index. Neighbour indices
    are all distinct, so counting indices gives every candidate exactly one
    vote and always selects the first neighbour.
    """
    votes={}
    for neighbor_index in index_list[0].tolist():
        label=pseudo_labels_map[neighbor_index]
        votes[label]=votes.get(label,0)+1

    # max() returns the first key with the highest count and dicts keep
    # insertion order, so a tie resolves to the earliest-listed neighbour's label.
    return max(votes,key=votes.get)



#generate output list by using the trained models
def generate_output_list(row_means2,clusters,sensors2,IDs,k=5):
    """Predict a label for every test feature vector and pair it with its ID.

    Parameters
    ----------
    row_means2 : sequence of 1-D feature vectors
        One feature vector per test row.
    clusters : dict
        Maps a sensor name to an object exposing ``kneighbors(X, n_neighbors)``
        (the fitted per-sensor neighbour index).
    sensors2 : indexable
        ``sensors2[i]`` is the sensor name of test row ``i``.
    IDs : indexable
        ``IDs[i]`` is the identifier of test row ``i``.
    k : int, default 5
        Number of neighbours requested for each query.

    Returns
    -------
    list of [id, label]
        One entry per test row, in input order.
    """
    output_list=[]
    for i in range(0,len(row_means2)):
        curr_id=IDs[i]
        curr_knn=clusters[sensors2[i]]

        # A single query must be 2-D: (1, n_features). -1 infers the feature
        # count from the vector instead of hard-coding it.
        query=np.array(row_means2[i]).reshape(1,-1)
        distances,indices=curr_knn.kneighbors(query,k)

        # NOTE(review): pseudo_labels_map is not a parameter; it is read from the
        # notebook's global namespace, where the training loop leaves the label
        # map of the last sensor it processed. That is only valid for every
        # sensor if all sensors share the same row/segment layout -- confirm.
        ans=find_class_using_majority_voting(indices,distances,pseudo_labels_map)
        output_list.append([curr_id,ans])
    return output_list





row_means2=[] #test data points (converting 125 cols to one col and projecting to a higher dimension)
lags=range(1, 16)   # lags 1 to 15, the same lags used for the training features

# Build one 23-value feature vector per test row, in the same feature order as
# the training segments: mean, median, IQR, std, var, range, MAD, mean abs
# deviation, then the 15 autocorrelations.
for a in output_data:

    signal=np.array(a)

    # Autocorrelation coefficients at the specified lags
    auto_list=[calculate_auto_corr(signal,lag) for lag in lags]

    mean=np.mean(signal)
    median=np.median(signal)
    std=np.std(signal)
    mad=np.median(np.abs(signal-median))
    mean_ad=np.mean(np.abs(signal-mean))
    # Inter-quartile range: 75th minus 25th percentile, matching the training
    # features (this was previously computed against the 23rd percentile).
    iqr=np.percentile(signal,75)-np.percentile(signal,25)

    curr_test_feature_vector=[mean,median,iqr,std,np.var(signal),np.ptp(signal),mad,mean_ad]+auto_list
    row_means2.append(curr_test_feature_vector)



# Second-to-last column: sensor name; last column: row ID.
sensors2=df2.iloc[:,-2]
IDs=df2.iloc[:,-1]
output_list=generate_output_list(row_means2,clusters,sensors2,IDs)

output_df=pd.DataFrame(output_list, columns=['ID', 'TARGET'])


csv_file_name ='/content/submission.csv'


# Write the DataFrame to a new CSV file
output_df.to_csv(csv_file_name, index=False)

<br><br>
<h3> Setting up Google Colab and downloading the datasets </h3>

In [2]:
# Install the Kaggle API credentials from Drive, then fetch and unpack the competition data.
# -p: do not fail when ~/.kaggle already exists (cell can be re-run).
! mkdir -p ~/.kaggle
! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json
# Restrict the API token to the current user, as the Kaggle API documentation recommends.
! chmod 600 ~/.kaggle/kaggle.json

! kaggle competitions download -c unsupervised-learning-m2023
# -o: overwrite previously extracted files without prompting on a re-run.
! unzip -o unsupervised-learning-m2023.zip


Downloading unsupervised-learning-m2023.zip to /content
 84% 122M/144M [00:00<00:00, 250MB/s] 
100% 144M/144M [00:00<00:00, 242MB/s]
Archive:  unsupervised-learning-m2023.zip
  inflating: Test.csv                
  inflating: Train.csv               


In [None]:
#1D
# Direct 1D distance formula on median (6250 cols-> 1col ) -> 12%
# KMeans -> <10%





# test data is also of the form mean, median (125 cols->1 col)

#2D
# KNN - point, point (6250 cols->6250 cols with k=3125) -> 13%
# KNN - 50 set means, medians (6250 cols ->125 cols with k=63) -> 23%
# KNN - 125 set means, medians (6250 cols->50 cols with k=25) -> 26%    increasing k more than n/2 doesn't increase accuracy more
# KNN - 250 set means, medians (6250 cols->25 cols with k=13) -> 25%

#3/4/and higher dimensional
# KNN - 125 set means, medians , mode (6250 cols->50 cols with k=25) -> 21%
# KNN - 125 set means, medians , mode , iqr (6250 cols->50 cols with k=25) -> 29%
# KNN - 125 set means, medians , iqr (6250 cols->50 cols with k=25) -> 35%
# KNN - 125 set means, medians , iqr , standard dev (6250 cols->50 cols with k=25) ->
# KNN - 125 set means, medians , iqr , var  (6250 cols->50 cols with k=25) ->
# KNN - 125 set means, medians , iqr , standard dev , var (6250 cols->50 cols with k=25) -> 37%
# -------------------------------------------------------------------------------------------------------
# KNN - 125 set means, medians , iqr , standard dev , var , range (6250 cols->50 cols with k=63)  -> 38%       (sub 23)
# KNN - 125 set means, medians , iqr , standard dev , var , range, COV (6250 cols->50 cols with k=63)  ->           (sub 24)
# KNN - 125 set means, medians , iqr , standard dev , var , COV (6250 cols->50 cols with k=63)  -> 22%                 (sub 22)
# KNN - 125 set means, medians , iqr , standard dev , var , COV,range (6250 cols->50 cols with k=63)  -> --           (sub 21)
# KNN - 125 set means, medians , iqr , var (6250 cols->50 cols with k=63)  -> 37%           (sub 21)
# -----------------------------------------------------------------------------------------------------------

# KNN - 125 set means, medians , iqr , standard dev , var , range (6250 cols->50 cols with k=25)  -> 40%        bestie
# ------------------------------------------------------------------------------------------------------------
# KNN - 125 set means, medians , iqr , standard dev , var , range (6250 cols->50 cols with k=45)  -> 38%



#try increasing test data cols also   X not a good idea
# KNN - 125 set means, medians , var , range , mad (6250 cols->50 cols with k=25)  ->  23%    [5 test cols]
# KNN - 125 set means, medians , std, var , range , mad (6250 cols->50 cols with k=25)  ->  28%    [5 test cols with corner = case]
# ------------------------------------------------------------------------------
# KNN - 125 set means, medians , std, var , range  (6250 cols->50 cols with k=25)  ->   16%   [125 test cols with corner = case]






# Non linear SVMs

# Non linear SVM - 125 set means, medians , iqr , standard dev , var , range (6250 cols->50 cols with gamma='scale' and C=1)  -> 28% (SUB 35)
# Non linear SVM - 50 set means, medians , iqr , standard dev , var , range (6250 cols->125 cols with gamma='scale' and C=1)  ->









# Random Forests

# Random Forest - 125 set means, medians , iqr , standard dev , var , range (6250 cols->50 cols with n_estimators=100 , random_state=42)