In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from google.colab import drive
from sklearn import preprocessing
import os

COLAB = True

In [2]:
RANDOM_STATE = 30255
NUM_EPOCHS = 3

if COLAB:
  drive.mount('/content/gdrive')
  PATH = "gdrive/Shareddrives/Adv ML Project/Data/"
  df = pd.read_csv(os.path.join(PATH + "preprocessed_data.csv"))

else:
  df = pd.read_csv('../data/preprocessed_data.csv')
  df = df.sample(n=100, random_state=RANDOM_STATE).reset_index()


le = preprocessing.LabelEncoder()
le.fit(df['CLASS'])
df['LABEL'] = le.transform(df['CLASS'])

df.head()

Mounted at /content/gdrive


Unnamed: 0,DESCRIPTION,SUBJECT,MAIN_SUBJECT,CLASS,BERT_TOKENIZED,SPACY_PREPROCESSED,LABEL
0,The United States Department of Energy Vehicle...,"['33 Advanced Propulsion Systems', '36 Materia...",33 Advanced Propulsion Systems,"Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2142, 2...",united states department energy vehicle techno...,0
1,Solar reflective “cool pavements” have been pr...,"['32 Energy Conservation, Consumption, And Uti...","32 Energy Conservation, Consumption, And Utili...","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 5943, 21346, 1...",solar reflective cool pavement propose potenti...,0
2,Inconel 718 alloy is used extensively in aerog...,"['36 Materials Science', '33 Advanced Propulsi...",33 Advanced Propulsion Systems,"Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 4297, 5643, 2...",inconel alloy extensively aerogas turbine allo...,0
3,The Production Tax Credit (PTC) and the Invest...,"['29 Energy Planning, Policy, And Economy', 'P...","29 Energy Planning, Policy, And Economy","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2537, 4...",production tax credit ptc investment tax credi...,0
4,The production tax credit (PTC) promotes wind ...,"['29 Energy Planning, Policy, And Economy', '1...","29 Energy Planning, Policy, And Economy","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2537, 4...",production tax credit ptc promote wind energy ...,0


In [3]:
display(df[['CLASS', 'LABEL']].drop_duplicates())

tmp_dict = df[['CLASS', 'LABEL']].drop_duplicates().set_index('LABEL').to_dict('index')
CATEGORY_DICT = {label: sub_dict['CLASS'] for label, sub_dict in tmp_dict.items()}

Unnamed: 0,CLASS,LABEL
0,"Energy Storage, Conversion, and Utilization",0
1223,Environmental Sciences,1
2446,Fission and Nuclear Technologies,2
3669,Fossil Fuels,3
4892,Renewable Energy Sources,4


In [4]:
X = df['SPACY_PREPROCESSED']
y = df['LABEL']

In [5]:
# Create a TF-IDF vectorizer to convert text to numerical features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      train_size=0.7, test_size=0.15, 
                                                      random_state=RANDOM_STATE,
                                                      shuffle=True)

In [7]:
# Create and train the KNN model
knn = KNeighborsClassifier(n_neighbors=3, weights='distance')  # Adjust the number of neighbors (K) as desired
knn.fit(X_train, y_train)


In [8]:
# Make predictions on the test set
y_pred = knn.predict(X_valid)

# Evaluate the model
print(classification_report(y_valid, y_pred))


              precision    recall  f1-score   support

           0       0.74      0.77      0.76       185
           1       0.75      0.74      0.74       182
           2       0.83      0.84      0.83       191
           3       0.68      0.78      0.72       174
           4       0.74      0.61      0.67       186

    accuracy                           0.75       918
   macro avg       0.75      0.75      0.75       918
weighted avg       0.75      0.75      0.75       918



In [9]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_neighbors': [3, 7, 10, 15, 20, 25],  # Example values for n_neighbors
    'weights': ['uniform', 'distance'],
}

# Create the grid search object
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions using the best model
y_pred = best_model.predict(X_valid)


In [10]:
print(best_model)

KNeighborsClassifier(n_neighbors=20, weights='distance')


In [16]:
print(classification_report(y_valid, y_pred, target_names=list(CATEGORY_DICT.values())))

                                             precision    recall  f1-score   support

Energy Storage, Conversion, and Utilization       0.75      0.83      0.79       185
                     Environmental Sciences       0.82      0.70      0.76       182
           Fission and Nuclear Technologies       0.86      0.84      0.85       191
                               Fossil Fuels       0.65      0.87      0.74       174
                   Renewable Energy Sources       0.80      0.61      0.69       186

                                   accuracy                           0.77       918
                                  macro avg       0.78      0.77      0.77       918
                               weighted avg       0.78      0.77      0.77       918



In [2]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_valid, y_pred)
cm = (cm / cm.sum(axis=1)) * 100

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Create a heatmap 
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, cmap='Blues', xticklabels=list(CATEGORY_DICT.values()), yticklabels=list(CATEGORY_DICT.values()))
plt.xticks(rotation=45, horizontalalignment='right')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

NameError: ignored