In [None]:
import pandas as pd
import numpy as np

from math import exp, sqrt, pi, log
from operator import itemgetter

from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [8]:
def prepare_text_english(data):

    data_desc_title = data[['description', 'title']]

    data_desc_title_ls = data_desc_title.values.tolist()



    ##### tokenization, casefolding#####

    import nltk

    nltk.download('punkt', quiet=True)



    from nltk.tokenize import word_tokenize

    tokenized = []

    for data in data_desc_title_ls:

        tokenized_desc = word_tokenize(data[0].lower())

        tokenized_title = word_tokenize(data[1].lower())

        tokenized.append([tokenized_desc, tokenized_title])



    ###### normalization, lemmatization,and removing punctuation marks########

    import nltk

    nltk.download('wordnet', quiet=True)

    import nltk

    nltk.download('averaged_perceptron_tagger', quiet=True)

    nltk.download("punkt", quiet=True)



    from nltk.stem import WordNetLemmatizer



    lemmatizer = WordNetLemmatizer()

    from nltk.corpus import wordnet



    def get_wordnet_pos(word):

        """Map POS tag to first character lemmatize() accepts"""

        tag = nltk.pos_tag([word])[0][1][0].upper()

        tag_dict = {"J": wordnet.ADJ,

                    "N": wordnet.NOUN,

                    "V": wordnet.VERB,

                    "R": wordnet.ADV}



        return tag_dict.get(tag, wordnet.NOUN)



    lemmatizer = WordNetLemmatizer()



    tokenized_lemmatized = []

    for data in tokenized:

        lemmatized_tokenized_desc = []

        for word in data[0]:

            lemmatized_tokenized_desc.append(lemmatizer.lemmatize(word, get_wordnet_pos(word)))



        lemmatized_tokenized_desc = [word for word in lemmatized_tokenized_desc if word.isalnum()]



        lemmatized_tokenized_title = []

        for word in data[1]:

            lemmatized_tokenized_title.append(lemmatizer.lemmatize(word, get_wordnet_pos(word)))



        lemmatized_tokenized_title = [word for word in lemmatized_tokenized_title if word.isalnum()]



        tokenized_lemmatized.append([lemmatized_tokenized_desc, lemmatized_tokenized_title])

    return tokenized_lemmatized





def remove_stop_words_english(tokenized_lemmatized):

    ######stop words#######

    list_of_words_frequency = []

    for data in tokenized_lemmatized:

        for word in data[0]:

            if (word not in [i[0] for i in list_of_words_frequency]):

                list_of_words_frequency.append([word, 1])

            else:

                list_of_words_frequency[[i[0] for i in list_of_words_frequency].index(word)][1] += 1

        for word in data[1]:

            if (word not in [i[0] for i in list_of_words_frequency]):

                list_of_words_frequency.append([word, 1])

            else:

                list_of_words_frequency[[i[0] for i in list_of_words_frequency].index(word)][1] += 1



    list_of_words_frequency = sorted(list_of_words_frequency, key=lambda l: l[1], reverse=True)



    number_of_stop_words = 10

    stop_words = [i[0] for i in list_of_words_frequency][:number_of_stop_words]



    with open('data/stop_words_english.txt', 'w', encoding='utf-8') as f:

        for page in stop_words:

            f.write("%s\n" % page)



    tokenized_lemmatized_removed_stop_words = []

    for data in tokenized_lemmatized:

        lemmatized_tokenized_removed_stop_words_desc = [word for word in data[0] if word not in stop_words]



        lemmatized_tokenized_removed_stop_words_title = [word for word in data[1] if word not in stop_words]



        tokenized_lemmatized_removed_stop_words.append(

            [lemmatized_tokenized_removed_stop_words_desc, lemmatized_tokenized_removed_stop_words_title])

    return tokenized_lemmatized_removed_stop_words





def add_id_english(tokenized_lemmatized_removed_stop_words):

    merged_id_english = []



    for i in range(len(tokenized_lemmatized_removed_stop_words)):

        merged_id_english.append(

            [i + 1, tokenized_lemmatized_removed_stop_words[i][0], tokenized_lemmatized_removed_stop_words[i][1]])

    return merged_id_english





def preProcess(data,words):

    if(words is not None):

        data_desc_title = data[['description', 'title']]

        data_desc_title_ls = data_desc_title.values.tolist()

        tokenized_lemmatized = prepare_text_english(data)

        tokenized_lemmatized_removed_stop_words = remove_stop_words_english(tokenized_lemmatized)

        merged_id_english = add_id_english(tokenized_lemmatized_removed_stop_words)

        dic={}

        for i in range(len(merged_id_english)):

            dic[data_desc_title_ls[i][0], data_desc_title_ls[i][1]] = merged_id_english[i][1], merged_id_english[i][2]

        corpes = []

        for i in tokenized_lemmatized_removed_stop_words:

            corpes.extend(i[0])

            corpes.extend(i[1])

        corpes = set(corpes)

        return dic, corpes

    else:

        res = []

        for i in data_desc_title_ls:

            decs = []

            for j in i[0].split():

                if (j in words):

                    decs.append(j)



            titles = []

            for j in i[1].split():

                if (j in words):

                    titles.append(j)

            res.append([decs, titles])

        return res, words

In [4]:
def make_X(raw_data, words=None, coeff=2):
    
    main_data, words = preProcess(raw_data, words)
    
    n_doc = len(main_data)
    n_word = len(words)
    
    vector_space_title = np.zeros((n_doc, n_word))
    vector_space_text = np.zeros((n_doc, n_word))

    for i in range(n_doc):
        for y in main_data[i]['title']:
            vector_space_title[i, words.index(y)] += 1
        for y in main_data[i]['text']:
            vector_space_text[i, words.index(y)] += 1

    vector_space_title_idfs = np.log10(n_word / np.count_nonzero(vector_space_title, axis=0))
    vector_space_text_idfs = np.log10(n_word / np.count_nonzero(vector_space_text, axis=0))
    
    vector_space_title_tfidfs = np.multiply(vector_space_title_tfs, vector_space_title_idfs)
    vector_space_text_tfidfs = np.multiply(vector_space_text_tfs, vector_space_text_idfs)
    
    final_vector_space = coeff * vector_space_title_tfidfs + vector_space_text_tfidfs
    
    return final_vector_space

In [5]:
def make_y(data):
    data_views = data[['views']].values
    labels = np.where(data_views >= np.median(data_views), 1, -1)
    return lables

In [9]:
train_data_phase2 = pd.read_csv("./data/train.csv")
test_data_phase2 = pd.read_csv("./data/test.csv")
data_phase1 = pd.read_csv("./data/ted_talks.csv")

X_train, words = make_X(train_data_phase2)
X_test = make_X(test_data_phase2, words)
X_phase1 = make_X(data_phase1, words)

y_train = make_y(train_data_phase2)
y_test = make_y(test_data_phase2)

X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

UnboundLocalError: local variable 'data_desc_title_ls' referenced before assignment

In [None]:
def evaluate(y_true, y_predicted):
    
    TP, TN, FP, FN = 0, 0, 0, 0
    for y1, y2 in zip(y_true, y_predicted):
        if y1 == y2:
            if y1 == 1:
                TP += 1
            elif y2 == -1:
                TN += 1
        elif y1 != y2:
            if y1 == 1:
                FN += 1
            elif y1 == -1:
                FP += 1
    
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    specificity = TN / (TN + FP)
    f1 = (2 * precision * recall) / (precision + recall)
    
    dic = {"Precision": precision, "Recall": recall, "Accuracy": accuracy, "F1_Score": f1,
           "Sensitivity": recall, "Specificity": specificity}
    return dic

In [None]:
def euclidianDistance(sample1, sample2):
    dis = np.linalg.norm(sample1 - sample2)
    return dis

def euclidianDistance2(sample1, sample2):
    distance = 0
    for i in range(len(sample1)):
        distance += pow(sample1[i] - sample2[i], 2)
    distance = math.sqrt(distance)
    return distance


class KNN:

    def __init__(self, k, dis):
        self.k = k
        self.dis = dis
        self.X = []
        self.y = []

    def distance(self, sample1, sample2, distance_type):
        if distance_type == 'euclidian':
            return euclidianDistance(sample1, sample2)
        
    def fit(self, X, y):
        self.X = X
        self.y = y

    def predict(self, test):

        predict = []
        
        for x in test:

            temp = []
            for y in self.X:
                temp += [self.distance(x, y, self.dis)]

            temp = list(zip(temp, self.y))
            temp = sorted(temp, key=lambda a_entry: a_entry[0])
            temp = np.array(temp)
            temp = temp[:self.k, 1]
            temp = np.unique(temp, return_counts=True)
            predict += [temp[0][temp[1].argmax()]]
        
        return np.array(predict)

In [None]:
K = [1, 5, 9]

best_acc1 = 0.0
best_value = 0
best_clf_knn = None

for k in K:
    
    clf = KNN(k=k, dis='euclidian')
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_validation)
    acc = evaluate(y_validation, predicted)["Accuracy"]
    
    if acc >= best_acc1:
        best_acc1 = acc
        best_value = c
        best_clf_knn = clf
        
print(best_acc1)
print(best_value)

In [None]:
predicted = best_clf_knn.predict(X_test)
print(evaluate(y_test, predicted))

In [None]:
class Classifier:
    def __init__(self, train_features, train_labels, test_features=None, test_labels=None):
        self.train_features = train_features
        self.train_labels = train_labels

        self.test_features = test_features
        self.test_labels = test_labels
        self.test_predictions = None

    def train(self):
        pass

    def predict(self, test_features):
        pass

def gaussian_probability(x, mean, stdev):
    stdev += 1
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

class NaiveBayes(Classifier):
    
    def __init__(self, train_features, train_labels, test_features=None, test_labels=None):
        super().__init__(train_features, train_labels, test_features, test_labels)
        self.label_feature_summaries = dict()
        self.label_dictionary = dict()

    def create_label_dictionary(self):
        for label, features in zip(self.train_labels, self.train_features):
            if label not in self.label_dictionary:
                self.label_dictionary[label] = []
            self.label_dictionary[label] += [features]

    def summarize_dataset(self, label, documents):
        summaries = [(np.mean(column), np.std(column), len(column)) for column in zip(*documents)]
        self.label_feature_summaries[label] = summaries

    def create_label_summaries(self):
        for label in self.label_dictionary:
            self.summarize_dataset(label, self.label_dictionary[label])

    def train(self):
        self.create_label_dictionary()
        self.create_label_summaries()
        return

    def predict_one(self, ind, test):
        total_doc_count = len(self.train_features)
        class_probabilities = dict()
        for label in self.label_feature_summaries:
            label_summary = self.label_feature_summaries[label]
            class_probabilities[label] = log(label_summary[0][-1] / total_doc_count)
            for feature, feature_summary in zip(test, label_summary):
                mean, std, _ = feature_summary
                class_probabilities[label] += log(gaussian_probability(feature, mean, std))
        self.test_predictions[ind] = max(class_probabilities.items(), key=itemgetter(1))[0]

    def predict(self, test_features=None):
        if self.test_features is None:
            self.test_features = test_features
        self.test_predictions = [None for _ in self.test_features]
        for i in range(len(self.test_features)):
            self.predict_one(i, self.test_features[i])

In [None]:
class NaiveBayes2:
    
    def __init__(self, laplace_smooth=0, c=0):
        self.laplace_smooth = laplace_smooth
        self.c = c
        self.p_f_x = []
        self.p_y = []
        self.num_class = 0
    
    def fit(self, X, y):
    
        self.num_class = len(list(set(y)))
        numbers = [[] for _ in range(self.num_class)]
        for x, z in zip(X, y):
            numbers[z] += [list(x)]

        self.p_y = []
        for x in numbers:
            self.p_y += [(len(x) + self.laplace_smooth) / (len(y) + len(numbers) * self.laplace_smooth)]

        self.p_f_x = [[self.laplace_smooth] * len(X[0]) for _ in range(self.num_class)]
        for i in range(self.num_class):
            for j in range(len(X[0])):
                for x in numbers[i]:
                    if x[j] > self.c:
                        self.p_f_x[i][j] += 1
                self.p_f_x[i][j] /= len(numbers[i])

    def predict(self, test):
    
        predict = []
        for x in test:
            res = []
            for i in range(self.num_class):
                p = self.p_y[i]
                for j in range(len(x)):
                    if x[j] > self.c:
                        p *= self.p_f_x[i][j]
                    else:
                        p *= 1 - self.p_f_x[i][j]
                res += [p]
            predict += [res]

        predict = np.array(predict)
        predict = np.argmax(predict, axis=1)
        return predict

In [None]:
L = [0.1, 0.3, 0.5]

best_acc2 = 0.0
best_value = 0
best_clf_naive_bayes = None

for l in L:
    
    clf = NaiveBayes2(laplace_smooth=l, c=0)
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_validation)
    acc = evaluate(y_validation, predicted)["Accuracy"]
    
    if acc >= best_acc2:
        best_acc2 = acc
        best_value = c
        best_clf_naive_bayes = clf
        
print(best_acc2)
print(best_value)

In [None]:
predicted = best_clf_naive_bayes.predict(X_test)
print(evaluate(y_test, predicted))

In [None]:
C_List = [1/2, 1, 3/2, 2]‬

best_acc3 = 0.0
best_value = 0
best_clf_svc = None


for c in C_LIST:
    
    clf = svm.SVC(C=c)
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_validation)
    acc = evaluate(y_validation, predicted)["Accuracy"]
    
    if acc >= best_acc3:
        best_acc3 = acc
        best_value = c
        best_clf_svc = clf
        
print(best_acc3)
print(best_value)

In [None]:
predicted = best_clf_svc.predict(X_test)
print(evaluate(y_test, predicted))

In [None]:
N_Estimators = [50, 100, 200, 500]

best_acc4 = 0.0
best_value = 0
best_clf_random_forest = None

for n in N_Estimators:
    
    clf = RandomForestClassifier(n_estimators=n)
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_validation)
    acc = evaluate(y_validation, predicted)["Accuracy"]
    
    if acc >= best_acc4:
        best_acc4 = acc
        best_value = c
        best_clf_random_forest = clf
        
print(best_acc4)
print(best_value)

In [None]:
predicted = best_clf_random_forest.predict(X_test)
print(evaluate(y_test, predicted))

In [None]:
accs = [best_acc1, best_acc2, best_acc3, best_acc4]
best_index = np.argmax(accs)
print(accs[best_index])

best_clf = None
if best_index == 0:
    best_clf = best_clf_knn
elif best_index == 1:
    best_clf = best_clf_naive_bayes
elif best_index == 1:
    best_clf = best_clf_svc
elif best_index == 1:
    best_clf = best_clf_random_forest

In [None]:
y_phase1 = best_clf.predict(X_phase1)

In [None]:
def stringSearch(string, X, y, words, index=0, ktop=20):
    
    string_split = string.split(" ")
    words_dic_tfidfs = {x:string_split.count(x) for x in string_split}
                                       
    scores = {}
    for i in range(X):
        s = 0
        for y in words_dic_norm:
            try:
                s += words_dic_tfids[y] * X[i][words.index(y)]
            except ValueError:
                continue
        scores[i] = s
    
    if index == 0:
        final = sorted(scores, key=scores.get, reverse=True)[:ktop]
    else:
        semi_final = sorted(scores, key=scores.get, reverse=True)
        final = []
        for x in semi_final:
            if y[x] == index:
                final += [x]
        final = final[:ktop]
    
    return final

In [None]:
while True:
    
    print("Please enter 0 if you do not want to use view, 1 and -1 if you want to use, and 404 if you want to exit!")
    
    s = input()
    
    if s == '0':
        print("Please enter your text.")
        string = input()
        
        output = stringSearch(string, X_phase1, y_phase1, words, index=0)
        print(output)
        
    elif s == '1':
        print("Please enter your text.")
        string = input()
        
        output = stringSearch(string, X_phase1, y_phase1, words, index=1)
        print(output)
        
    elif s == '-1':
        print("Please enter your text.")
        string = input()
        
        output = stringSearch(string, X_phase1, y_phase1, words, index=-1)
        print(output)
        
    elif s == '404':
        print("END.")
        break
        
    else:
        print("Please enter a number according to the guideline!")