In [25]:
import pandas as pd
import nltk
import numpy as np

nltk.download('punkt')


!pip install emoji==1.5.0
# Intersection code, the DataFrame should be empty
import re
import pickle
import emoji

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Global vocabulary shared by the helpers below: maps a token to its integer
# column index. word_mapping_train() adds entries to it, word_mapping_test()
# only reads it, and testing() rebinds it to the dictionary unpickled from
# "word_to_index.sav".
word_to_index = {}

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[0m

# Train/test intersection check: the merged DataFrame should be empty

In [26]:
# Leakage check: inner-join the test set with the training set on the raw
# "text" column. Any row in the result is a sentence present in both files,
# so the displayed frame is expected to be empty.
df_train = pd.read_csv("/kaggle/input/traindatadeslab/hindi_train_val.csv")
df_test = pd.read_csv("/kaggle/input/testdatadeslab/hindi_test.csv").merge(
    df_train, on="text"
)
df_test

Unnamed: 0,label_x,text,label_y


In [27]:

# Run-mode switch. The last cell calls testing() when TEST is truthy and
# training() otherwise; `file` is the CSV path handed to load_data() in
# whichever mode runs. To train instead of test, enable the commented pair
# below and disable the active pair.
# TEST = False
# file = "/kaggle/input/traindatadeslab/hindi_train_val.csv"

TEST = True
file = "/kaggle/input/testdatadeslab/hindi_test.csv"


# Helper functions

In [28]:
def extract_emojis(s):
    """Return `s` with every emoji character surrounded by single spaces.

    Despite the name, nothing is extracted or removed: each character that is
    a member of ``emoji.UNICODE_EMOJI['en']`` is emitted as ``' ' + c + ' '``
    and every other character is copied through unchanged. The test is done
    one character at a time.
    """
    pieces = []
    for ch in s:
        if ch in emoji.UNICODE_EMOJI['en']:
            pieces.append(' ' + ch + ' ')
        else:
            pieces.append(ch)
    return ''.join(pieces)



def word_mapping_train(sentence):
    """Map a tokenised sentence to vocabulary indices, growing the vocabulary.

    Args:
        sentence: iterable of tokens.

    Returns:
        list[int]: one index per token, in order. A token not yet present in
        the global ``word_to_index`` is inserted with the next free index
        (the dictionary's current size) before being mapped.

    Side effects:
        Mutates the global ``word_to_index``.
    """
    # The default argument len(word_to_index) is evaluated before setdefault
    # inserts, so a new token receives exactly the next unused index. This
    # replaces a bare `except:` that treated every exception as "unknown word".
    return [word_to_index.setdefault(word, len(word_to_index)) for word in sentence]

def word_mapping_test(sentence):
    """Map a tokenised sentence to vocabulary indices without growing the vocabulary.

    Args:
        sentence: iterable of tokens.

    Returns:
        list[int]: indices of the tokens found in the global ``word_to_index``,
        in order. Tokens missing from the vocabulary are skipped, so the result
        can be shorter than the input.
    """
    # Explicit membership test instead of a bare `except: pass`: unknown
    # tokens are still dropped, but unrelated errors are no longer hidden.
    return [word_to_index[word] for word in sentence if word in word_to_index]


# Data preprocessing

In [29]:
def load_data(file, test=False):
    """Read a CSV and turn its "text" column into lists of vocabulary indices.

    Each text goes through extract_emojis(), then nltk.word_tokenize(), then a
    token-to-index mapping.

    Args:
        file: path passed to ``pd.read_csv``.
        test: when falsy, tokens are mapped with word_mapping_train() (which
            adds unseen tokens to the vocabulary); when truthy, with
            word_mapping_test() (which skips unseen tokens).

    Returns:
        The loaded DataFrame with "text" replaced by the index lists; other
        columns are left as read.
    """
    to_indices = word_mapping_test if test else word_mapping_train

    frame = pd.read_csv(file)
    frame["text"] = (
        frame["text"]
        .apply(extract_emojis)
        .apply(nltk.word_tokenize)
        .apply(to_indices)
    )
    return frame

def tfidfFitTransform(df):
    """Fit IDF weights on `df` and return the IDF-weighted document-term matrix.

    Args:
        df: DataFrame whose "text" column holds, per row, a list of integer
            vocabulary indices (as produced by load_data()).

    Returns:
        (tf_idf, idf_vector):
            tf_idf     -- array of shape (len(df), len(word_to_index)).
            idf_vector -- array of shape (len(word_to_index),), computed as
                          log(number_of_rows / document_frequency).

    Notes:
        * The term part is binary presence, not a count: an index repeated in
          one row still contributes 1. Summing this matrix column-wise is what
          yields the document frequency.
        * A vocabulary entry that occurs in no row has document frequency 0;
          NumPy then emits a divide-by-zero warning and the IDF is ``inf``.
    """
    n_docs = len(df)
    presence = np.zeros((n_docs, len(word_to_index)), dtype=int)

    for row, token_ids in zip(presence, df["text"].values):
        # `row` is a view into `presence`; integer-list assignment marks every
        # listed column once, however often the index repeats.
        row[token_ids] = 1

    doc_freq = presence.sum(axis=0)
    idf_vector = np.log(n_docs / doc_freq)

    tf_idf = presence * idf_vector
    return tf_idf, idf_vector

def tfidfTransform(df, idf_vector):
    """Weight the binary document-term matrix of `df` by a fitted IDF vector.

    Args:
        df: DataFrame whose "text" column holds, per row, a list of integer
            vocabulary indices.
        idf_vector: per-vocabulary-entry weights, broadcast across the rows.

    Returns:
        Array of shape (len(df), len(word_to_index)): 0/1 presence of each
        vocabulary entry in each row, multiplied by `idf_vector`.
    """
    presence = np.zeros((len(df), len(word_to_index)), dtype=int)
    for row, token_ids in zip(presence, df["text"].values):
        # `row` is a view, so this writes into `presence`; repeated indices
        # are applied only once by integer-list indexing.
        row[token_ids] += 1
    return presence * idf_vector
    

def remove_stopwords(tf_idf, idf_vector, min_idf=2.5, max_idf=8.5):
    """Drop the feature columns whose IDF lies outside ``[min_idf, max_idf]``.

    Args:
        tf_idf: 2-D array with one column per vocabulary entry.
        idf_vector: 1-D array of IDF weights, one per column of `tf_idf`.
        min_idf: columns with IDF strictly below this value are removed.
        max_idf: columns with IDF strictly above this value are removed.
            The defaults reproduce the previously hard-coded 2.5 / 8.5 cut-offs.

    Returns:
        A new array containing only the retained columns, in their original
        order.
    """
    idf_vector = np.asarray(idf_vector)
    # Same predicate as before, expressed as a mask; written as a negated OR
    # so that NaN IDF values stay "kept" exactly as they were previously.
    # Assumes idf_vector has one entry per column of tf_idf (as produced by
    # tfidfFitTransform), so no lookup through word_to_index is needed.
    is_stopword = np.logical_or(idf_vector < min_idf, idf_vector > max_idf)
    return np.asarray(tf_idf)[:, ~is_stopword]
    

# KNN

# Training

In [30]:
def training():
    """Train a KNN classifier on `file`, pick k on a hold-out split, and save it.

    Steps:
        1. Load and index the CSV named by the global `file`, building the
           vocabulary from scratch.
        2. Build IDF-weighted features and drop columns outside the IDF band.
        3. For k = 2..20, fit on the training part of a fixed split
           (random_state=42) and score on the held-out part, printing each
           accuracy.
        4. Refit a classifier with the best k on ALL rows.
        5. Pickle the model ("knn.sav"), the vocabulary ("word_to_index.sav")
           and the IDF vector ("idf_vector.sav") into the working directory.

    Returns:
        None. Results are printed and written to disk.
    """
    # Start from an empty vocabulary so that re-running this in a kernel which
    # already holds one (earlier training run, or testing() having loaded a
    # saved one) cannot leave stale columns with zero document frequency.
    word_to_index.clear()

    df = load_data(file, test=False)
    tf_idf, idf_vector = tfidfFitTransform(df)
    X = remove_stopwords(tf_idf, idf_vector)
    y = np.array(df[['label']]).reshape((-1,))

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    candidate_ks = range(2, 21)
    score = []
    for k in candidate_ks:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        acc = knn.score(X_test, y_test)
        score.append(acc)
        print(f"k = {k} acc: {acc}")

    # Look the winner up through the candidate range rather than a hard-coded
    # "+ 2" offset, so the two cannot drift apart.
    best_pos = int(np.argmax(np.array(score)))
    best_k = candidate_ks[best_pos]
    print()
    print(f"best_k: {best_k} acc: {score[best_pos]}")

    # Fix: the final model previously used the loop variable `k` (always the
    # last candidate, 20) instead of the selected `best_k`.
    best_knn = KNeighborsClassifier(n_neighbors=best_k)
    best_knn.fit(X, y)

    # Context managers make sure each file is flushed and closed.
    with open("knn.sav", 'wb') as fh:
        pickle.dump(best_knn, fh)
    with open("word_to_index.sav", 'wb') as fh:
        pickle.dump(word_to_index, fh)
    with open("idf_vector.sav", 'wb') as fh:
        pickle.dump(idf_vector, fh)
    print("Model Saved")

# Testing

In [31]:
def testing():
    """Load the saved model artifacts and predict labels for the CSV in `file`.

    Rebinds the global ``word_to_index`` to the vocabulary unpickled from
    "word_to_index.sav", then loads the IDF vector ("idf_vector.sav") and the
    classifier ("knn.sav"), featurises `file` with them and predicts.

    Returns:
        (pred, y): the classifier's predictions and the "label" column of the
        CSV as a flat array, in row order.
    """
    print("Testing...")
    global word_to_index

    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load artifacts that were written by training().
    # Context managers close each file handle as soon as it has been read.
    with open("word_to_index.sav", 'rb') as fh:
        word_to_index = pickle.load(fh)
    with open("idf_vector.sav", 'rb') as fh:
        idf_vector = np.array(pickle.load(fh))
    with open("knn.sav", 'rb') as fh:
        knn = pickle.load(fh)

    print("Model loaded")

    # test=True: tokens absent from the loaded vocabulary are skipped.
    df = load_data(file, test=True)

    tf_idf = tfidfTransform(df, idf_vector)
    # Same IDF-based column filter as in training, driven by the saved IDF
    # vector, so the feature columns line up with what the model was fit on.
    X = remove_stopwords(tf_idf, idf_vector)
    y = np.array(df[['label']]).reshape((-1,))

    pred = knn.predict(X)
    return pred, y

In [32]:
# Entry point: train when TEST is falsy; otherwise evaluate the saved model,
# print the per-class report and write the predictions to disk.
if not TEST:
    training()
else:
    pred, y_true = testing()
    print(classification_report(y_true, pred))

    y_pred_df = pd.Series(pred)
    # to_csv returns None when given a path, so result_csv ends up as None.
    result_csv = y_pred_df.to_csv("resultKNN.csv", index=False)
    print("Result saved in resultKNN.csv")

Testing...
Model loaded
              precision    recall  f1-score   support

           0       0.82      0.55      0.66      3496
           1       0.64      0.87      0.74      3232

    accuracy                           0.70      6728
   macro avg       0.73      0.71      0.70      6728
weighted avg       0.73      0.70      0.70      6728

Result saved in resultKNN.csv
