In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the data
# folder used by the cells below lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# importing modules

In [None]:
import numpy as np
import pandas as pd
import random
from gensim.models import word2vec
from gensim.models import KeyedVectors
from itertools import chain 
from collections import Counter 

# Base folder on the mounted Drive; cells below prepend it to each data file
# name they read or write (note the trailing slash).
file_path = '/content/drive/My Drive/My Projects/FYP/Sentiment Lexicon/Implementation/Model3 - Lankadeepa News Comments/'

# 1. remove punctuation marks, numbers, foreign words and special characters

In [None]:
# Code points 3456..3583 are the Sinhala Unicode block (U+0D80..U+0DFF).
SINHALA_FIRST = 3456
SINHALA_LAST = 3583

comments_df = pd.read_csv(file_path + '0_comments_all.csv', header=0, delimiter=';', quoting=3)
comments = comments_df['comment']

# Split every comment on whitespace, then keep only the maximal runs of
# Sinhala characters inside each token. Any other character (digit,
# punctuation, Latin letter, ...) ends the current run, so a token such as
# "abc<sinhala>123<sinhala>" yields two separate words.
# NOTE(review): joiners outside this range (e.g. U+200D) also end a run -
# confirm that splitting at them is intended.
words_list = []
for comment in comments:
  # str() turns missing values (NaN) into text that contains no Sinhala
  # characters, so they contribute nothing.
  for word in str(comment).strip().split():
    new_word = ''
    for character in word:
      if SINHALA_FIRST <= ord(character) <= SINHALA_LAST:
        new_word += character
      else:
        if new_word:
          words_list.append(new_word)
        new_word = ''
    # Flush the run that reaches the end of the token.
    if new_word:
      words_list.append(new_word)

noise_removed_words_df = pd.DataFrame(words_list, columns=['word'])
noise_removed_words_df.to_csv(file_path + '1_removed_noise.csv', sep=',', encoding='utf-8', index=False)
print('created 1_removed_noise.csv with %i words from %i comments' % (noise_removed_words_df.shape[0], comments_df.shape[0]))

created 1_removed_noise.csv with 4680274 words from 290189 comments


# 2. remove duplicates and find frequencies

In [None]:
# Tally every cleaned word. Counter keeps first-seen order; converting to a
# plain dict keeps the KeyError-on-missing-word behaviour that later cells
# get from indexing `frequencies[word]`.
frequencies = dict(Counter(words_list))

# One unique word per line; this file is the input handed to the POS tagger.
# Written as UTF-8 explicitly (like every other file in this notebook) so the
# Sinhala text does not depend on the platform default encoding.
with open(file_path + 'untagged.txt', 'w', encoding='utf-8') as fo:
  for word in frequencies:
    fo.write(word + '\n')

unique_words_df = pd.DataFrame(list(frequencies.items()), columns=['word', 'frequency'])
unique_words_df.to_csv(file_path + '2_removed_duplicates.csv', sep=',', encoding='utf-8', index=False)

print('created 2_removed_duplicates.csv with %i unique words from %i words' % (len(frequencies), len(words_list)))

created 2_removed_duplicates.csv with 223338 unique words from 4680274 words


# 3. filter words by POS tag (adjectives, adverbs)

In [None]:
# POS tags that are kept:
# JJ  Adjective
# JCV Adjective in Compound Verbs
# RB  Adverb
# NIP Nipathana
# AUX Modal Auxiliary
KEPT_POS_TAGS = {'JJ', 'JCV', 'NIP', 'AUX', 'RB'}

# tagged.txt holds one "<word> <tag>" pair per line; lines starting with '%%'
# are skipped.
with open(file_path + 'tagged.txt', 'r', encoding='utf-8') as fo:
  lines = fo.readlines()

pos_tags = {}
for line in lines:
  # Skip '%%' lines and blank lines (a trailing empty line would otherwise
  # break the two-value unpacking below).
  if line.startswith('%%') or not line.strip():
    continue
  word, pos_tag = line.strip().split()
  if pos_tag in KEPT_POS_TAGS:
    pos_tags[word] = pos_tag

# The inner join keeps only words that have both a frequency and a kept tag.
filtered_pos_tagged_words_df = pd.DataFrame(list(pos_tags.items()), columns=['word', 'pos_tag'])
filtered_pos_tagged_words_df = unique_words_df.merge(filtered_pos_tagged_words_df, how='inner', on='word')
# Sorted by word: the prefix grouping in the next section relies on words
# that share a prefix being adjacent.
filtered_pos_tagged_words_df = filtered_pos_tagged_words_df.sort_values('word')
filtered_pos_tagged_words_df.to_csv(file_path + '3_filtered_by_pos_tag.csv', sep=',', encoding='utf-8', index=False)

print('created 3_filtered_by_pos_tag.csv with %i words' % len(filtered_pos_tagged_words_df))

created 3_filtered_by_pos_tag.csv with 9257 words


In [None]:
# Convert the tagger output ("<word> <tag>" per line) into a two-column CSV.
with open(file_path + 'tagged.txt', 'r', encoding='utf-8') as fo:
  lines = fo.readlines()

# The with-block guarantees the CSV is flushed and closed even if a malformed
# line raises part-way through.
with open(file_path + 'lankadeepa_pos_tagged_words.csv', 'w', encoding='utf-8') as fo:
  fo.write('word,pos_tag\n')
  for line in lines:
    # Skip '%%' lines and blank lines.
    if line.startswith('%%') or not line.strip():
      continue
    word, pos_tag = line.strip().split()
    fo.write(word + ',' + pos_tag + '\n')

In [None]:
# Read the exported word/POS-tag table back and show its first rows.
pos_tagged_csv = file_path + 'lankadeepa_pos_tagged_words.csv'
df = pd.read_csv(pos_tagged_csv, header=0)
df.head()

# 4. filter words by synset

In [None]:
words = filtered_pos_tagged_words_df['word']

# ==============================================================================
# fixed length stem approach 
# ==============================================================================

# n = 5
# stem = words[0][:n-1]
# group = []
# for word in words:
#   if stem in word:
#     group.append(word)
#   else:
#     if len(word) > n:
#       stem = word[:n-1]
#       print(group)
#       group = [word]
#     else:
#       print([word])

# ==============================================================================
# dynamic length stem approach 
# ==============================================================================

# `words` is sorted, so every word that starts with the current stem directly
# follows it. The first word of a group is the stem; the group grows while the
# following words start with that stem, and each group is represented by its
# most frequent member.
synset_filtered_words = []
stem = None
group = []
for word in words:
  if stem is not None and word.startswith(stem):
    group.append(word)
    continue
  if group:
    # max() returns the first maximum, i.e. ties go to the earliest word.
    synset_filtered_words.append(max(group, key=lambda item: frequencies[item]))
  stem = word
  group = [word]
# Flush the final group: the loop only emits a group when the *next* group
# starts, so without this the last group would be dropped.
if group:
  synset_filtered_words.append(max(group, key=lambda item: frequencies[item]))

filtered_synset_words_df = pd.DataFrame(synset_filtered_words, columns=['word'])
filtered_synset_words_df = filtered_pos_tagged_words_df.merge(filtered_synset_words_df, how='inner', on='word')
filtered_synset_words_df = filtered_synset_words_df.sort_values('frequency', ascending=False)
filtered_synset_words_df.to_csv(file_path + '4_filtered_by_synset.csv', sep=',', encoding='utf-8', index=False)

print('created 4_filtered_by_synset.csv with %i words' % len(filtered_synset_words_df))

created 4_filtered_by_synset.csv with 5841 words


# 5. filter words by average frequency

In [None]:
# Mean occurrence count over every unique word in the corpus.
total = sum(frequencies.values())
average_frequency = total / len(frequencies.keys())
print(average_frequency, len(frequencies.keys()))

# Keep the synset representatives that occur more often than that mean.
words = filtered_synset_words_df['word']
frequency_filtered_words = [
  candidate for candidate in words
  if frequencies[candidate] > average_frequency
]

filtered_frequency_words_df = pd.DataFrame(frequency_filtered_words, columns=['word'])
filtered_frequency_words_df = filtered_synset_words_df.merge(filtered_frequency_words_df, how='inner', on='word')
filtered_frequency_words_df = filtered_frequency_words_df.sort_values('word', ascending=True)
filtered_frequency_words_df.to_csv(file_path + '5_filtered_by_frequency.csv', sep=',', encoding='utf-8', index=False)

print('created 5_filtered_by_frequency.csv with %i words' % len(filtered_frequency_words_df))

# 6. calculate Cohen's kappa value for final data set



In [None]:
# Authenticate the Colab user, then build a gspread client (`gc`) from the
# application-default credentials; the next cell uses `gc` to open a sheet.
import gspread
from oauth2client.client import GoogleCredentials
from google.colab import auth
auth.authenticate_user()
gc = gspread.authorize(GoogleCredentials.get_application_default())

In [None]:
spreadsheet = gc.open('5_filtered_by_frequency')
worksheet = spreadsheet.worksheet('Sheet1')
# Columns 1-4 of the sheet, each without its first (header) cell.
words, binod, udyogi, frequency = [
  worksheet.col_values(column)[1:] for column in (1, 2, 3, 4)
]

In [None]:
# Count the '1' and '-1' labels in the `udyogi` column; any other value is
# left out of both counts.
p = udyogi.count('1')
n = udyogi.count('-1')
print("Total sentiment tagged words -", p + n)
print("Positive words -", p)
print("Negative words -", n)

# Agreement between the `binod` and `udyogi` label columns
# (presumably the two annotators - confirm against the sheet).
from sklearn.metrics import cohen_kappa_score
y1, y2 = binod, udyogi
kappa = cohen_kappa_score(y1, y2, labels=None, weights=None)
print("Cohen's Kappa Score -", kappa)

# 7. model

In [None]:
################################################################################
# Loading word embedding models
################################################################################
from gensim.models import word2vec
from gensim.models import FastText
# Load the two saved embedding models from Drive. `sentiment_polarity` reads
# them as the globals `word2vec_model` and `fasttext_model`.
word2vec_model = word2vec.Word2Vec.load('/content/drive/My Drive/My Projects/FYP/Sentiment Lexicon/Implementation/Model1 - Word Embeddings/word2vec_model3/word2vec.model')
fasttext_model = FastText.load('/content/drive/My Drive/My Projects/FYP/Sentiment Lexicon/Implementation/Model1 - Word Embeddings/fasttext_model3/fasttext.model')

In [None]:
################################################################################
# Model
################################################################################
def sentiment_polarity(word, seed_set, model, threshold_polarity=0, threshold_similarity=0):
  """Classify `word` as positive (1) or negative (-1) from its similarity to seed words.

  Each seed word contributes `seed polarity * similarity(word, seed word)`;
  contributions whose absolute value does not exceed `threshold_similarity`
  are ignored. The sum is divided by the total number of seed words (ignored
  ones included) and compared with `threshold_polarity`.

  Args:
    word: the word to classify.
    seed_set: dict mapping seed word -> polarity value (1 / -1 in this notebook).
    model: 'word2vec' or 'fasttext'; selects the global `word2vec_model` or
      `fasttext_model`.
    threshold_polarity: minimum score for a positive result.
    threshold_similarity: minimum absolute contribution for a seed to count.

  Returns:
    1 if the score is >= `threshold_polarity`, otherwise -1.

  Raises:
    ValueError: if `model` is not a supported name or `seed_set` is empty.
  """
  # Resolve the similarity function once instead of re-checking per seed word.
  # An unknown name used to fall through with similarity 0 and therefore
  # always produced polarity 1 without any warning.
  if model == 'word2vec':
    similarity_of = word2vec_model.similarity
  elif model == 'fasttext':
    similarity_of = fasttext_model.similarity
  else:
    raise ValueError("model must be 'word2vec' or 'fasttext', got %r" % (model,))

  if not seed_set:
    raise ValueError('seed_set must contain at least one seed word')

  # find score
  total_similarity = 0
  for seed_word, seed_polarity in seed_set.items():
    similarity = seed_polarity * similarity_of(word, seed_word)
    if abs(similarity) > threshold_similarity:
      total_similarity += similarity
  score = total_similarity / len(seed_set)

  # find polarity
  return 1 if score >= threshold_polarity else -1

In [None]:
################################################################################
# Performance
################################################################################
from sklearn import metrics
def evaluate(y_true, y_pred):
  """Return the macro-averaged metrics of a prediction run plus its accuracy.

  Args:
    y_true: true labels.
    y_pred: predicted labels, in the same order as `y_true`.

  Returns:
    The 'macro avg' entry of the classification report (as a dict) with the
    report's 'accuracy' value added under the key 'accuracy'.
  """
  report = metrics.classification_report(y_true, y_pred, digits=4, output_dict=True)
  macro_summary = report['macro avg']
  macro_summary['accuracy'] = report['accuracy']
  return macro_summary

# 8. evaluating test set

In [None]:
# Authenticate the Colab user again and rebuild the gspread client (`gc`)
# used by the following cells to open the seed-word and test-set sheets.
import gspread
from oauth2client.client import GoogleCredentials
from google.colab import auth
auth.authenticate_user()
gc = gspread.authorize(GoogleCredentials.get_application_default())

In [None]:
spreadsheet = gc.open('6_seed_words')
worksheet = spreadsheet.worksheet('6_seed_words')
# Columns of the seed-word sheet with the header cell dropped.
words = worksheet.col_values(1)[1:]
scores = worksheet.col_values(2)[1:]
# Kept under its own name: assigning this list to `frequencies` would replace
# the word -> count dict built in section 2, which sections 4 and 5 index by
# word, and re-running those cells afterwards would fail.
seed_frequencies = worksheet.col_values(3)[1:]

# Split the seed words by label: '1' is positive, every other value is
# treated as negative.
positive_words = []
negative_words = []

for i, seed_word in enumerate(words):
  if scores[i] == '1':
    positive_words.append(seed_word)
  else:
    negative_words.append(seed_word)

In [None]:
spreadsheet = gc.open('7_test_set')
# worksheet = spreadsheet.worksheet('7_test_set')
worksheet = spreadsheet.worksheet('test_set_revised')
test_words = worksheet.col_values(1)
test_scores = worksheet.col_values(2)

# word -> integer label; both columns are read from their first row.
test_set = {
  test_words[i]: int(test_scores[i])
  for i in range(len(test_words))
}

In [None]:
# Metric series for the word2vec model, one entry per seed-set size.
Y1_acc, Y1_pre, Y1_rec, Y1_f1s = [], [], [], []

for size in range(2, 400, 2):
  # Balanced seed set: the first size/2 positive and size/2 negative words.
  seed_set = {}
  for i in range(size // 2):
    seed_set[positive_words[i]] = 1
    seed_set[negative_words[i]] = -1

  labelled_words = list(test_set.items())
  y_true = [label for _, label in labelled_words]
  y_pred = [sentiment_polarity(candidate, seed_set, 'word2vec') for candidate, _ in labelled_words]

  results = evaluate(y_true, y_pred)
  for series, metric in ((Y1_acc, 'accuracy'), (Y1_pre, 'precision'), (Y1_rec, 'recall'), (Y1_f1s, 'f1-score')):
    series.append(results[metric])

In [None]:
# Metric series for the fastText model, one entry per seed-set size.
Y2_acc = []
Y2_pre = []
Y2_rec = []
Y2_f1s = []

# Sweep the same sizes as the word2vec cell. The plot cell draws Y2_f1s
# against range(2, 400, 2); a narrower range here (previously the single
# size 356) leaves the two sequences with different lengths and makes the
# plot call fail.
for size in range(2, 400, 2):
  # Balanced seed set: the first size/2 positive and size/2 negative words.
  seed_set = {}
  for i in range(size // 2):
    seed_set[positive_words[i]] = 1
    seed_set[negative_words[i]] = -1
  y_true = []
  y_pred = []
  for test_word in test_set.keys():
    y_true.append(test_set[test_word])
    y_pred.append(sentiment_polarity(test_word, seed_set, 'fasttext'))
  results = evaluate(y_true, y_pred)
  Y2_acc.append(results['accuracy'])
  Y2_pre.append(results['precision'])
  Y2_rec.append(results['recall'])
  Y2_f1s.append(results['f1-score'])

In [None]:
import matplotlib.pyplot as plt

X = range(2, 400, 2)
Y1 = Y1_f1s
Y2 = Y2_f1s

fig, ax = plt.subplots()
# ax.plot(X, Y1, color='r', label='Word2Vec')
ax.plot(X, Y2, color='b', label='FastText')
# Horizontal guide line slightly above the best fastText F1-score.
guide_level = max(Y2_f1s)+0.005
ax.plot([0, 400], [guide_level, guide_level], color='r', linestyle='-', linewidth=1)
ax.set_title('No of seed words vs F1-Score')
ax.set_xlabel('no of seed words')
ax.set_ylabel('f1-score')
ax.axis([0, 400, 0.5, 1])
ax.legend(loc="upper right")
plt.show()

In [None]:
# max(Y2_f1s)
# Y2_f1s.index(max(Y2_f1s))
# Dump the fastText series: accuracy, precision, recall, f1-score.
for metric_values in (Y2_acc, Y2_pre, Y2_rec, Y2_f1s):
  print(metric_values)

In [None]:
# Fixed seed set: the first 74 positive and 74 negative seed words.
seed_set = {}
for i in range(74):
  seed_set[positive_words[i]] = 1
  seed_set[negative_words[i]] = -1

thresholds_polarity = np.arange(-0.002, 0.003, 0.001)
thresholds_similarity = np.arange(0.00, 0.02, 0.01)

# Best grid point so far: (polarity threshold, similarity threshold, f1-score),
# with K holding that point's full metric set.
max_x, max_y, max_z = 0, 0, 0
K = {}
# rows[i][j] = f1-score for thresholds_polarity[i], thresholds_similarity[j].
rows = []
for threshold_polarity in thresholds_polarity:
  print(threshold_polarity)
  row = []
  for threshold_similarity in thresholds_similarity:
    # print(threshold_similarity)
    labelled_words = list(test_set.items())
    y_true = [label for _, label in labelled_words]
    y_pred = [
      sentiment_polarity(candidate, seed_set, 'fasttext', threshold_polarity, threshold_similarity)
      for candidate, _ in labelled_words
    ]
    results = evaluate(y_true, y_pred)
    row.append(results['f1-score'])
    # Strict comparison: on ties the earliest grid point is kept.
    if results['f1-score'] > max_z:
      max_x, max_y, max_z = threshold_polarity, threshold_similarity, results['f1-score']
      for metric in ('accuracy', 'precision', 'recall', 'f1-score'):
        K[metric] = results[metric]
  rows.append(row)

In [None]:
import matplotlib.pyplot as plt

# Filled contour of the f1-score over the threshold grid. `rows` is indexed
# [polarity][similarity], so it is transposed to match the meshgrid layout.
X, Y = np.meshgrid(thresholds_polarity, thresholds_similarity)
Z = np.transpose(np.array(rows))
fig, ax = plt.subplots(1,1)
cp = ax.contourf(X, Y, Z, 16)
# ax.plot([max_x], [max_y], 'ro')
fig.colorbar(cp) # Add a colorbar to a plot
ax.set_title('F1-Score (fasttext)')
ax.set_xlabel('thresholds_polarity')
ax.set_ylabel('threshold_similarity')
plt.show()

# Best grid point and its metrics.
best_point = (max_x, max_y)
best_metrics = (K['accuracy'], K['precision'], K['recall'], K['f1-score'])
print('(%f, %f)' % best_point)
print('accuracy: %f\nprecision: %f\nrecall: %f\nf1-score: %f' % best_metrics)

# 9. create final lexicon

In [None]:
# Authenticate the Colab user again and rebuild the gspread client (`gc`)
# used by the following cells to read the word sheets for the final lexicon.
import gspread
from oauth2client.client import GoogleCredentials
from google.colab import auth
auth.authenticate_user()
gc = gspread.authorize(GoogleCredentials.get_application_default())

In [None]:
spreadsheet = gc.open('7_test_set')
# worksheet = spreadsheet.worksheet('Sheet1')
# Scored words: columns 1 and 2 of 'test_set_revised', read from the first row.
worksheet = spreadsheet.worksheet('test_set_revised')
senti_words, senti_scores = (list(worksheet.col_values(column)) for column in (1, 2))
# Full word list: columns 1 and 2 of 'Sheet2', read from the first row.
worksheet = spreadsheet.worksheet('Sheet2')
all_words, all_scores = (list(worksheet.col_values(column)) for column in (1, 2))

In [None]:
# Propagate each scored word's label to every word in `all_words` that starts
# with it. When several stems match a word, the one that comes last in
# `senti_words` wins.
lexicon = {}
for i, stem in enumerate(senti_words):
  # A blank sheet cell comes back as ''; ''.startswith-matching would assign
  # this score to every single word, so blank stems are skipped.
  if not stem:
    continue
  for word in all_words:
    if word.startswith(stem):
      lexicon[word] = senti_scores[i]

# One "<word>,<score>" line per entry, no header line.
with open(file_path + 'lexicon.csv', 'w', encoding='utf-8') as fo:
  for word, score in lexicon.items():
    fo.write(word + ',' + score + '\n')

In [None]:
spreadsheet = gc.open('7_test_set')
worksheet = spreadsheet.worksheet('test_set_revised')
senti_words = worksheet.col_values(1)[0:]
senti_scores = worksheet.col_values(2)[0:]

with open(file_path + '3_filtered_by_pos_tag.csv', 'r', encoding='utf-8') as fo:
  lines = fo.readlines()

# word -> frequency, parsed from the "word,frequency,pos_tag" rows
# (the first line is the header).
words = {}
for line in lines[1:]:
  # A blank (e.g. trailing) line would break the three-value unpacking.
  if not line.strip():
    continue
  word, frequency, pos_tag = line.strip().split(',')
  words[word] = int(frequency)

# Print the corpus frequency of every scored word; a word that is missing
# from the POS-filtered file raises KeyError.
for senti_word in senti_words:
  print(words[senti_word])

# 10. sentiment analysis using our lexicon

In [None]:
################################################################################
# Loading seed set
################################################################################
# Both files are read from their second line on, the same way the two later
# cells in this section load them (they skip the first line as a header).
# NOTE(review): confirm both files really start with a header row.
with open(file_path + '10_lexicon.csv', 'r', encoding='utf-8') as fo:
  lines = fo.readlines()

# word -> integer sentiment score
seed_set = {}
for line in lines[1:]:
  word, score = line.strip().split(',')
  seed_set[word] = int(score)

################################################################################
# Testing
################################################################################
with open(file_path + '9_tagged_comments.csv', 'r', encoding='utf-8') as fo:
  lines = fo.readlines()

y_true = []
y_pred = []
for line in lines[1:]:
  docid, comment, tag = line.strip().split(';')
  # Any tag other than POSITIVE / NEGATIVE is recorded as 0.
  true_score = 0
  if tag == 'POSITIVE':
    true_score = 1
  elif tag == 'NEGATIVE':
    true_score = -1
  # Sum the lexicon scores of the comment's words; words outside the lexicon
  # contribute 0.
  words = comment.split()
  total_score = sum(seed_set.get(word, 0) for word in words)
  # A total of 0 (tie, or no lexicon word found) is predicted as negative.
  pred_score = 1 if total_score > 0 else -1
  # print(true_score, pred_score)
  y_true.append(true_score)
  y_pred.append(pred_score)

################################################################################
# Performance
################################################################################
from tabulate import tabulate
from sklearn import metrics

y_true_list = y_true
y_pred_list = y_pred

classification_report = metrics.classification_report(y_true_list, y_pred_list, digits=4, output_dict=True)
results = classification_report['macro avg']
results['accuracy'] = classification_report['accuracy']

rows = [
  ['Accuracy', results['accuracy']],
  ['Precision', results['precision']],
  ['Recall', results['recall']],
  ['F1-Score', results['f1-score']]
]

print(tabulate(rows, tablefmt='github'))

In [None]:
################################################################################
# Loading seed set
################################################################################
with open(file_path + '10_lexicon.csv', 'r', encoding='utf-8') as fo:
  lines = fo.readlines()

# word -> integer sentiment score (first line of the file is skipped).
seed_set = {}
for line in lines[1:]:
  word, score = line.strip().split(',')
  seed_set[word] = int(score)

################################################################################
# Creating training/test sets
################################################################################
from sklearn.model_selection import train_test_split

with open(file_path + '9_tagged_comments.csv', 'r', encoding='utf-8') as fo:
  lines = fo.readlines()

# One [p, n] vector and one label per comment: p is the sum of the positive
# lexicon scores found in the comment, n the sum of the non-positive ones.
vectors = []
scores = []
for line in lines[1:]:
  docid, comment, tag = line.strip().split(';')
  # Any tag other than POSITIVE / NEGATIVE is labelled 0.
  score = 0
  if tag == 'POSITIVE':
    score = 1
  elif tag == 'NEGATIVE':
    score = -1
  words = comment.split()
  # vector = []
  # for word in seed_set.keys():
  #   vector.append(seed_set[word] * words.count(word))
  p = 0
  n = 0
  for word in words:
    s = seed_set.get(word)
    if s is None:
      # Not a lexicon word.
      continue
    if s > 0:
      p += s
    else:
      n += s
  vector = [p, n]
  vectors.append(vector)
  scores.append(score)

# X_train, X_test, y_train, y_test = train_test_split(vectors, scores, test_size=0.2, random_state=42)
################################################################################
# Train classifiers
################################################################################
# from sklearn import svm
# clf = svm.SVC()
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)

# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()
# y_pred = gnb.fit(X_train, y_train).predict(X_test)

# from sklearn.neighbors import KNeighborsClassifier
# neigh = KNeighborsClassifier(n_neighbors=3)
# neigh.fit(X_train, y_train)
# y_pred = neigh.predict(X_test)

# from sklearn import tree
# clf = tree.DecisionTreeClassifier()
# clf = clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)

# from sklearn.linear_model import SGDClassifier
# clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)

# from sklearn.ensemble import AdaBoostClassifier
# clf = AdaBoostClassifier(n_estimators=100)
# clf = clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)

from sklearn.model_selection import cross_validate
from sklearn import svm

N_FOLDS = 5

clf = svm.SVC(kernel='linear', C=1)
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
# The results get their own name: storing them in `scores` would overwrite
# the label list, so this cell could not be run a second time.
cv_results = cross_validate(clf, vectors, scores, cv=N_FOLDS, scoring=scoring)

# from sklearn.model_selection import cross_validate
# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()
# scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
# cv_results = cross_validate(gnb, vectors, scores, cv=N_FOLDS, scoring=scoring)

# from sklearn.model_selection import cross_validate
# from sklearn import tree
# clf = tree.DecisionTreeClassifier()
# scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
# cv_results = cross_validate(clf, vectors, scores, cv=N_FOLDS, scoring=scoring)

# Per-fold results followed by the mean of each metric over the folds
# (divides by the actual number of folds instead of a literal 5).
print(cv_results)
print('Accuracy -', sum(cv_results['test_accuracy'])/len(cv_results['test_accuracy']))
print('Precision -', sum(cv_results['test_precision_macro'])/len(cv_results['test_precision_macro']))
print('Recall -', sum(cv_results['test_recall_macro'])/len(cv_results['test_recall_macro']))
print('F1-Score -', sum(cv_results['test_f1_macro'])/len(cv_results['test_f1_macro']))

################################################################################
# Performance
################################################################################
# from tabulate import tabulate
# from sklearn import metrics

# y_true_list = y_test
# y_pred_list = y_pred

# classification_report = metrics.classification_report(y_true_list, y_pred_list, digits=4, output_dict=True)
# results = classification_report['macro avg']
# results['accuracy'] = classification_report['accuracy']

# rows = [
#   ['Accuracy', results['accuracy']],
#   ['Precision', results['precision']],
#   ['Recall', results['recall']],
#   ['F1-Score', results['f1-score']]
# ]

# print(tabulate(rows, tablefmt='github'))

In [None]:
################################################################################
# Loading seed set
################################################################################
with open(file_path + '10_lexicon.csv', 'r', encoding='utf-8') as fo:
  lines = fo.readlines()

# word -> integer sentiment score (first line of the file is skipped).
seed_set = {}
for line in lines[1:]:
  word, score = line.strip().split(',')
  seed_set[word] = int(score)

################################################################################
# Creating training/test sets
################################################################################
with open(file_path + '9_tagged_comments.csv', 'r', encoding='utf-8') as fo:
  lines = fo.readlines()

# bow: word -> 1-based id in order of first appearance across all comments.
bow = {}
i = 1
doc_list = []
tag_list = []
for line in lines[1:]:
  docid, comment, tag = line.strip().split(';')
  # Any tag other than POSITIVE / NEGATIVE is labelled 0.
  score = 0
  if tag == 'POSITIVE':
    score = 1
  elif tag == 'NEGATIVE':
    score = -1
  words = comment.split()
  for word in words:
    if word not in bow:
      bow[word] = i
      i += 1
  doc_list.append(comment)
  tag_list.append(score)

ID_SCALE = 20000      # divisor applied to the vocabulary id
VECTOR_LENGTH = 390   # fixed feature length: 3 values per distinct word

# Each distinct word of a comment contributes three values:
# scaled vocabulary id, number of occurrences, lexicon score (0 if absent).
vec_list = []
for doc in doc_list:
  tokens = doc.split()
  vec = []
  # dict.fromkeys keeps first-appearance order; iterating a set would give a
  # different feature order from run to run.
  for word in dict.fromkeys(tokens):
    vec.append(bow[word]/ID_SCALE)
    # Count whole tokens: str.count on the raw comment would also count the
    # word wherever it occurs inside longer words.
    vec.append(tokens.count(word))
    vec.append(seed_set.get(word, 0))
  # Pad with zeros, and truncate longer comments, so every vector has the
  # same length and the classifier receives a rectangular matrix.
  vec = vec[:VECTOR_LENGTH]
  vec += [0] * (VECTOR_LENGTH - len(vec))
  vec_list.append(np.array(vec))

from sklearn.model_selection import train_test_split
# 80/20 split with a fixed random_state so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  vec_list, tag_list, test_size=0.2, random_state=10
)
# X_train = np.array(X_train).reshape((4008, 130))
# X_test = np.array(X_test).reshape((1002, 130))
################################################################################
# Train classifiers
################################################################################
from sklearn import svm
# SVC with its default settings: fit on the training part, predict the rest.
clf = svm.SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()
# y_pred = gnb.fit(X_train, y_train).predict(X_test)

# from sklearn import tree
# clf = tree.DecisionTreeClassifier()
# clf = clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)

# from sklearn.neighbors import KNeighborsClassifier
# neigh = KNeighborsClassifier(n_neighbors=3)
# neigh.fit(X_train, y_train)
# y_pred = neigh.predict(X_test)

################################################################################
# Performance
################################################################################
from tabulate import tabulate
from sklearn import metrics

y_true_list, y_pred_list = y_test, y_pred

# Macro-averaged precision / recall / f1 plus overall accuracy for the split.
classification_report = metrics.classification_report(y_true_list, y_pred_list, digits=4, output_dict=True)
results = classification_report['macro avg']
results['accuracy'] = classification_report['accuracy']

rows = [
  [label, results[metric]]
  for label, metric in (
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1-Score', 'f1-score'),
  )
]

print(tabulate(rows, tablefmt='github'))

################################################################################
# Cross validation
################################################################################
# from sklearn.model_selection import cross_validate
# from sklearn import svm
# clf = svm.SVC(kernel='linear', C=1)
# scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
# scores = cross_validate(clf, vec_list, tag_list, cv=5, scoring=scoring)

# from sklearn.model_selection import cross_validate
# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()
# scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
# scores = cross_validate(gnb, vectors, scores, cv=5, scoring=scoring)

# from sklearn.model_selection import cross_validate
# from sklearn import tree
# clf = tree.DecisionTreeClassifier()
# scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
# scores = cross_validate(clf, vectors, scores, cv=5, scoring=scoring)

# print(scores)
# print('Accuracy -', sum(scores['test_accuracy'])/5)
# print('Precision -', sum(scores['test_precision_macro'])/5)
# print('Recall -', sum(scores['test_recall_macro'])/5)
# print('F1-Score -', sum(scores['test_f1_macro'])/5)

In [None]:
# Stack the training vectors into a 2-D matrix and show its shape.
X_train_matrix = np.array(X_train)
print(X_train_matrix.shape)
# One row per sample, width inferred from the data: a hard-coded
# (4008, 260) fails as soon as the row count or the vector length differs.
a = X_train_matrix.reshape((len(X_train), -1))