In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download("punkt", quiet=True)

import ast

from math import exp, sqrt, pi, log
from operator import itemgetter

from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [2]:
def prepare_text_english(data):

    data_desc_title = data[['description', 'title']]
    data_desc_title_ls = data_desc_title.values.tolist()

    tokenized = []
    for data in data_desc_title_ls:
        tokenized_desc = word_tokenize(data[0].lower())
        tokenized_title = word_tokenize(data[1].lower())
        tokenized.append([tokenized_desc, tokenized_title])

    lemmatizer = WordNetLemmatizer()


    def get_wordnet_pos(word):

        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)


    lemmatizer = WordNetLemmatizer()

    tokenized_lemmatized = []

    for data in tokenized:

        lemmatized_tokenized_desc = []
        for word in data[0]:
            lemmatized_tokenized_desc.append(lemmatizer.lemmatize(word, get_wordnet_pos(word)))
        lemmatized_tokenized_desc = [word for word in lemmatized_tokenized_desc if word.isalnum()]

        lemmatized_tokenized_title = []
        for word in data[1]:
            lemmatized_tokenized_title.append(lemmatizer.lemmatize(word, get_wordnet_pos(word)))
        lemmatized_tokenized_title = [word for word in lemmatized_tokenized_title if word.isalnum()]

        tokenized_lemmatized.append([lemmatized_tokenized_desc, lemmatized_tokenized_title])
        

    return tokenized_lemmatized


def remove_stop_words_english(tokenized_lemmatized):


    list_of_words_frequency = []
    
    for data in tokenized_lemmatized:
        
        for word in data[0]:
            if word not in [i[0] for i in list_of_words_frequency]:
                list_of_words_frequency.append([word, 1])
            else:
                list_of_words_frequency[[i[0] for i in list_of_words_frequency].index(word)][1] += 1

        for word in data[1]:
            if word not in [i[0] for i in list_of_words_frequency]:
                list_of_words_frequency.append([word, 1])
            else:
                list_of_words_frequency[[i[0] for i in list_of_words_frequency].index(word)][1] += 1

    list_of_words_frequency = sorted(list_of_words_frequency, key=lambda l: l[1], reverse=True)


    number_of_stop_words = 10
    stop_words = [i[0] for i in list_of_words_frequency][:number_of_stop_words]
    with open('data/stop_words_english.txt', 'w', encoding='utf-8') as f:
        for page in stop_words:
            f.write("%s\n" % page)
    
    
    tokenized_lemmatized_removed_stop_words = []
    for data in tokenized_lemmatized:
        lemmatized_tokenized_removed_stop_words_desc = [word for word in data[0] if word not in stop_words]
        lemmatized_tokenized_removed_stop_words_title = [word for word in data[1] if word not in stop_words]
        tokenized_lemmatized_removed_stop_words.append(
            [lemmatized_tokenized_removed_stop_words_desc, lemmatized_tokenized_removed_stop_words_title])

        
    return tokenized_lemmatized_removed_stop_words


def add_id_english(tokenized_lemmatized_removed_stop_words):

    merged_id_english = []

    for i in range(len(tokenized_lemmatized_removed_stop_words)):
        merged_id_english.append(
            [i + 1, tokenized_lemmatized_removed_stop_words[i][0], tokenized_lemmatized_removed_stop_words[i][1]])

    return merged_id_english


def preProcess(data,words):
    
    data_desc_title = data[['description', 'title']]
    data_desc_title_ls = data_desc_title.values.tolist()
    tokenized_lemmatized = prepare_text_english(data)
    tokenized_lemmatized_removed_stop_words = remove_stop_words_english(tokenized_lemmatized)
    merged_id_english = add_id_english(tokenized_lemmatized_removed_stop_words)

def preProcess(data,words):
    
    data_desc_title = data[['description', 'title']]
    data_desc_title_ls = data_desc_title.values.tolist()

    if words is None:
        
        tokenized_lemmatized = prepare_text_english(data)
        tokenized_lemmatized_removed_stop_words = remove_stop_words_english(tokenized_lemmatized)
        merged_id_english = add_id_english(tokenized_lemmatized_removed_stop_words)

        arr=[]
        for i in range(len(merged_id_english)):
            dic = {}
            dic['text']=merged_id_english[i][1]
            dic['title'] =merged_id_english[i][2]
            arr.append(dic)
        corpes = []

        for i in tokenized_lemmatized_removed_stop_words:
            corpes.extend(i[0])
            corpes.extend(i[1])
            
        corpes = list(set(corpes))

        return arr, corpes

    else:

        arr = []
        for i in data_desc_title_ls:
            
            dic = {}
            decs = []
            for j in i[0].split():
                if j in words:
                    decs.append(j)

            titles = []
            for j in i[1].split():
                if j in words:
                    titles.append(j)
                    
            dic["text"] = decs
            dic["title"] = titles
            arr.append(dic)

        return arr, words

In [3]:
def make_X(raw_data, words=None, idfs=None, coeff=2):
    
    main_data, words = preProcess(raw_data, words)
    
    n_doc = len(main_data)
    n_word = len(words)
    
    vector_space_title = np.zeros((n_doc, n_word))
    vector_space_text = np.zeros((n_doc, n_word))

    for i in range(n_doc):
        for y in main_data[i]['title']:
            vector_space_title[i, words.index(y)] += 1
        for y in main_data[i]['text']:
            vector_space_text[i, words.index(y)] += 1
    
    vector_space_title_tfs = vector_space_title
    vector_space_text_tfs = vector_space_text
    
    vector_space_title_idfs = []
    vector_space_text_idfs = []
    if idfs is None:
        vector_space_title_idfs = np.log10(n_word / (np.count_nonzero(vector_space_title, axis=0) + 1))
        vector_space_text_idfs = np.log10(n_word / (np.count_nonzero(vector_space_text, axis=0) + 1))
    else:
        vector_space_title_idfs = idfs[0]
        vector_space_text_idfs = idfs[1]
    
    vector_space_title_tfidfs = np.multiply(vector_space_title_tfs, vector_space_title_idfs)
    vector_space_text_tfidfs = np.multiply(vector_space_text_tfs, vector_space_text_idfs)
    
    final_vector_space = coeff * vector_space_title_tfidfs + vector_space_text_tfidfs
    
    return final_vector_space, words, (vector_space_title_idfs, vector_space_text_idfs)

In [4]:
train_data_phase2 = pd.read_csv("./data/train.csv")
test_data_phase2 = pd.read_csv("./data/test.csv")
data_phase1 = pd.read_csv("./data/ted_talks.csv")

X_train, words, idfs = make_X(train_data_phase2)
X_test, _, _ = make_X(test_data_phase2, words, idfs=idfs)
X_phase1, _, _ = make_X(data_phase1, words)

y_train = train_data_phase2["views"].values
y_test = test_data_phase2["views"].values

X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

In [5]:
def evaluate(y_true, y_predicted, epsilon=10**-9):
    
    TP, TN, FP, FN = 0, 0, 0, 0
    for y1, y2 in zip(y_true, y_predicted):
        if y1 == y2:
            if y1 == 1:
                TP += 1
            elif y2 == -1:
                TN += 1
        elif y1 != y2:
            if y1 == 1:
                FN += 1
            elif y1 == -1:
                FP += 1
    
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP + epsilon)
    recall = TP / (TP + FN + epsilon)
    specificity = TN / (TN + FP + epsilon)
    f1 = (2 * precision * recall) / (precision + recall + epsilon)
    
    dic = {"Precision": precision, "Recall": recall, "Accuracy": accuracy, "F1_Score": f1,
           "Sensitivity": recall, "Specificity": specificity}
    return dic

In [6]:
def euclidianDistance(sample1, sample2):
    dis = np.linalg.norm(sample1 - sample2)
    return dis

def euclidianDistance2(sample1, sample2):
    distance = 0
    for i in range(len(sample1)):
        distance += pow(sample1[i] - sample2[i], 2)
    distance = math.sqrt(distance)
    return distance


class KNN:

    def __init__(self, k, dis):
        self.k = k
        self.dis = dis
        self.X = []
        self.y = []

    def distance(self, sample1, sample2, distance_type):
        if distance_type == 'euclidian':
            return euclidianDistance(sample1, sample2)
        
    def fit(self, X, y):
        self.X = X
        self.y = y

    def predict(self, test):

        predict = []
        
        for x in test:

            temp = []
            for y in self.X:
                temp += [self.distance(x, y, self.dis)]

            temp = list(zip(temp, self.y))
            temp = sorted(temp, key=lambda a_entry: a_entry[0])
            temp = np.array(temp)
            temp = temp[:self.k, 1]
            temp = np.unique(temp, return_counts=True)
            predict += [temp[0][temp[1].argmax()]]
        
        return np.array(predict).reshape(-1)

In [7]:
K = [1, 5, 9]

best_acc1 = 0.0
best_value = 0
best_clf_knn = None

for k in K:
    
    clf = KNN(k=k, dis='euclidian')
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_validation)
    acc = evaluate(y_validation, predicted)["Accuracy"]
    
    if acc >= best_acc1:
        best_acc1 = acc
        best_value = k
        best_clf_knn = clf
        
print(best_acc1)
print(best_value)

0.5623188405797102
9


In [8]:
predicted = best_clf_knn.predict(X_test)
print(evaluate(y_test, predicted))

{'Precision': 0.6363636363347107, 'Recall': 0.11570247933788676, 'Accuracy': 0.5490196078431373, 'F1_Score': 0.19580419554110226, 'Sensitivity': 0.11570247933788676, 'Specificity': 0.9402985074556695}


In [9]:
class Classifier:
    def __init__(self, train_features, train_labels, test_features=None, test_labels=None):
        self.train_features = train_features
        self.train_labels = train_labels

        self.test_features = test_features
        self.test_labels = test_labels
        self.test_predictions = None

    def train(self):
        pass

    def predict(self, test_features):
        pass

def gaussian_probability(x, mean, stdev):
    stdev += 1
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

class NaiveBayes(Classifier):
    
    def __init__(self, train_features, train_labels, test_features=None, test_labels=None):
        super().__init__(train_features, train_labels, test_features, test_labels)
        self.label_feature_summaries = dict()
        self.label_dictionary = dict()

    def create_label_dictionary(self):
        for label, features in zip(self.train_labels, self.train_features):
            if label not in self.label_dictionary:
                self.label_dictionary[label] = []
            self.label_dictionary[label] += [features]

    def summarize_dataset(self, label, documents):
        summaries = [(np.mean(column), np.std(column), len(column)) for column in zip(*documents)]
        self.label_feature_summaries[label] = summaries

    def create_label_summaries(self):
        for label in self.label_dictionary:
            self.summarize_dataset(label, self.label_dictionary[label])

    def train(self):
        self.create_label_dictionary()
        self.create_label_summaries()
        return

    def predict_one(self, ind, test):
        total_doc_count = len(self.train_features)
        class_probabilities = dict()
        for label in self.label_feature_summaries:
            label_summary = self.label_feature_summaries[label]
            class_probabilities[label] = log(label_summary[0][-1] / total_doc_count)
            for feature, feature_summary in zip(test, label_summary):
                mean, std, _ = feature_summary
                class_probabilities[label] += log(gaussian_probability(feature, mean, std))
        self.test_predictions[ind] = max(class_probabilities.items(), key=itemgetter(1))[0]

    def predict(self, test_features=None):
        if self.test_features is None:
            self.test_features = test_features
        self.test_predictions = [None for _ in self.test_features]
        for i in range(len(self.test_features)):
            self.predict_one(i, self.test_features[i])

In [10]:
NB = NaiveBayes(X_train, y_train, X_validation, y_validation)
NB.train()
NB.predict()

best_acc2 = evaluate(y_test, NB.test_predictions)["Accuracy"]

In [11]:
NB = NaiveBayes(X_train, y_train, X_test, y_test)
NB.train()
NB.predict()

print(evaluate(y_test, NB.test_predictions))

{'Precision': 0.5666666666572222, 'Recall': 0.28099173553486784, 'Accuracy': 0.5568627450980392, 'F1_Score': 0.37569060728744547, 'Sensitivity': 0.28099173553486784, 'Specificity': 0.8059701492477167}


In [12]:
# class NaiveBayes2:
    
#     def __init__(self, laplace_smooth=0, c=0):
#         self.laplace_smooth = laplace_smooth
#         self.c = c
#         self.p_f_x = []
#         self.p_y = []
#         self.num_class = 0
    
#     def fit(self, X, y):
    
#         self.num_class = len(list(set(y)))
#         numbers = [[] for _ in range(self.num_class)]
#         for x, z in zip(X, y):
#             numbers[z] += [list(x)]

#         self.p_y = []
#         for x in numbers:
#             self.p_y += [(len(x) + self.laplace_smooth) / (len(y) + len(numbers) * self.laplace_smooth)]

#         self.p_f_x = [[self.laplace_smooth] * len(X[0]) for _ in range(self.num_class)]
#         for i in range(self.num_class):
#             for j in range(len(X[0])):
#                 for x in numbers[i]:
#                     if x[j] > self.c:
#                         self.p_f_x[i][j] += 1
#                 self.p_f_x[i][j] /= len(numbers[i])

#     def predict(self, test):
    
#         predict = []
#         for x in test:
#             res = []
#             for i in range(self.num_class):
#                 p = self.p_y[i]
#                 for j in range(len(x)):
#                     if x[j] > self.c:
#                         p *= self.p_f_x[i][j]
#                     else:
#                         p *= 1 - self.p_f_x[i][j]
#                 res += [p]
#             predict += [res]

#         predict = np.array(predict)
#         predict = np.argmax(predict, axis=1)
#         return predict

In [13]:
# L = [0.1, 0.3, 0.5]

# best_acc2 = 0.0
# best_value = 0
# best_clf_naive_bayes = None

# for l in L:
    
#     clf = NaiveBayes2(laplace_smooth=l, c=0.1)
#     clf.fit(X_train, y_train)
#     predicted = clf.predict(X_validation)
#     acc = evaluate(y_validation, predicted)["Accuracy"]
    
#     if acc >= best_acc2:
#         best_acc2 = acc
#         best_value = l
#         best_clf_naive_bayes = clf
        
# print(best_acc2)
# print(best_value)

In [14]:
# predicted = best_clf_naive_bayes.predict(X_test)
# print(evaluate(y_test, predicted))

In [15]:
C_List = [1/2, 1, 3/2, 2]

best_acc3 = 0.0
best_value = 0
best_clf_svc = None


for c in C_List:
    
    clf = svm.SVC(C=c, kernel="rbf")
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_validation)
    acc = evaluate(y_validation, predicted)["Accuracy"]
    
    if acc >= best_acc3:
        best_acc3 = acc
        best_value = c
        best_clf_svc = clf
        
print(best_acc3)
print(best_value)

0.6086956521739131
2


In [16]:
predicted = best_clf_svc.predict(X_test)
print(evaluate(y_test, predicted))

{'Precision': 0.7551020408009164, 'Recall': 0.305785123964415, 'Accuracy': 0.6235294117647059, 'F1_Score': 0.4352941172316263, 'Sensitivity': 0.305785123964415, 'Specificity': 0.9104477611872356}


In [17]:
N_Estimators = [50, 100, 200, 500]

best_acc4 = 0.0
best_value = 0
best_clf_random_forest = None

for n in N_Estimators:
    
    clf = RandomForestClassifier(n_estimators=n)
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_validation)
    acc = evaluate(y_validation, predicted)["Accuracy"]
    
    if acc >= best_acc4:
        best_acc4 = acc
        best_value = n
        best_clf_random_forest = clf
        
print(best_acc4)
print(best_value)

0.6086956521739131
100


In [18]:
predicted = best_clf_random_forest.predict(X_test)
print(evaluate(y_test, predicted))

{'Precision': 0.7674418604472688, 'Recall': 0.2727272727250188, 'Accuracy': 0.615686274509804, 'F1_Score': 0.4024390239984384, 'Sensitivity': 0.2727272727250188, 'Specificity': 0.9253731343214525}


In [19]:
accs = [best_acc1, best_acc2, best_acc3, best_acc4]
best_index = np.argmax(accs)
print(accs[best_index])

best_clf = None
if best_index == 0:
    best_clf = best_clf_knn
    print("KNN")
elif best_index == 1:
    best_clf = best_clf_naive_bayes
    print("Naive Bayes")
elif best_index == 2:
    best_clf = best_clf_svc
    print("SVC")
elif best_index == 3:
    best_clf = best_clf_random_forest
    print("Random Forest")

0.6086956521739131
SVC


In [20]:
y_phase1 = best_clf.predict(X_phase1)

In [21]:
def string_preProcess_english(str):
    
    tokenized_str = word_tokenize(str.lower())
    lemmatizer = WordNetLemmatizer()

    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}

        return tag_dict.get(tag, wordnet.NOUN)

    lemmatizer = WordNetLemmatizer()

    lemmatized_tokenized = []
    for word in tokenized_str:
        lemmatized_tokenized.append(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

    lemmatized_tokenized = [word for word in lemmatized_tokenized if word.isalnum()]
    with open('data/stop_words_english.txt', encoding='utf-8') as f:
        lines = f.read().splitlines()
    stop_words = lines

    tokenized_lemmatized_removed_stop_words_str = [word for word in lemmatized_tokenized if
                                                   word not in stop_words]
    
    return tokenized_lemmatized_removed_stop_words_str

In [22]:
def stringSearch(string, X, Y, words, index=0, ktop=20):
    
    string_split = string_preProcess_english(string)
    print(string_split)
    words_dic_tfidfs = {x:string_split.count(x) for x in string_split}
                                       
    scores = {}
    for i in range(len(X)):
        s = 0
        for y in words_dic_tfidfs:
            try:
                s += words_dic_tfidfs[y] * X[i][words.index(y)]
            except ValueError:
                continue
        scores[i] = s
    
    if index == 0:
        final = sorted(scores, key=scores.get, reverse=True)[:ktop]
    else:
        semi_final = sorted(scores, key=scores.get, reverse=True)
        final = []
        for x in semi_final:
            if Y[x] == index:
                final += [x]
        final = final[:ktop]
    
    return final

In [23]:
while True:
    
    print("Please enter 0 if you do not want to use view, 1 and -1 if you want to use, and 404 if you want to exit!")
    
    s = input()
    
    if s == '0':
        print("Please enter your text.")
        string = input()
        
        output = stringSearch(string, X_phase1, y_phase1, words, index=0)
        print(output)
        print()
        
    elif s == '1':
        print("Please enter your text.")
        string = input()
        
        output = stringSearch(string, X_phase1, y_phase1, words, index=1)
        print(output)
        print()
        
    elif s == '-1':
        print("Please enter your text.")
        string = input()
        
        output = stringSearch(string, X_phase1, y_phase1, words, index=-1)
        print(output)
        print()
        
    elif s == '404':
        print("END.")
        break
        
    else:
        print("Please enter a number according to the guideline!")

Please enter 0 if you do not want to use view, 1 and -1 if you want to use, and 404 if you want to exit!
0
Please enter your text.
car machine computer
['car', 'machine', 'computer']
[2004, 948, 240, 1730, 2121, 901, 1539, 363, 1283, 270, 750, 893, 1130, 1293, 1349, 1655, 1693, 1962, 2155, 1941]

Please enter 0 if you do not want to use view, 1 and -1 if you want to use, and 404 if you want to exit!
1
Please enter your text.
car machine computer
['car', 'machine', 'computer']
[2004, 1730, 1539, 893, 1655, 1962, 2155, 1560, 2110, 2236, 2318, 2503, 2139, 2153, 13, 812, 984, 1028, 1424, 1551]

Please enter 0 if you do not want to use view, 1 and -1 if you want to use, and 404 if you want to exit!
-1
Please enter your text.
car machine computer
['car', 'machine', 'computer']
[948, 240, 2121, 901, 363, 1283, 270, 750, 1130, 1293, 1349, 1693, 1941, 2502, 227, 598, 702, 1826, 2401, 2460]

Please enter 0 if you do not want to use view, 1 and -1 if you want to use, and 404 if you want to exit!


In [24]:
from sklearn.decomposition import PCA

pca = PCA(n_components=1000)
X_train2 = pca.fit_transform(X_train)
X_validation2 = pca.transform(X_validation)
X_test2 = pca.transform(X_test)
X_pahse1_2 = pca.transform(X_phase1)

In [25]:
C_List = [0.01, 0.1, 1, 10, 100]

best_acc5 = 0.0
best_value = 0
best_clf_svc2 = None


for c in C_List:
    
    clf = svm.SVC(C=c, kernel="rbf")
    clf.fit(X_train2, y_train)
    predicted = clf.predict(X_validation2)
    acc = evaluate(y_validation, predicted)["Accuracy"]
    
    if acc >= best_acc5:
        best_acc5 = acc
        best_value = c
        best_clf_svc2 = clf
        
print(best_acc5)
print(best_value)

0.5971014492753624
1


In [26]:
predicted = best_clf_svc2.predict(X_test2)
print(evaluate(y_test, predicted))

{'Precision': 0.7878787878549128, 'Recall': 0.2148760330560754, 'Accuracy': 0.6, 'F1_Score': 0.3376623373212177, 'Sensitivity': 0.2148760330560754, 'Specificity': 0.947761194022778}
