In [None]:
import pandas as pd
import numpy as np

# Package for Classification

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import (precision_recall_curve, auc, confusion_matrix,
                             f1_score, fbeta_score, precision_score,
                             recall_score, classification_report)
from sklearn.svm import LinearSVC
from numpy import array
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectPercentile as SP

# For SVD
from scipy.linalg import svd
from sklearn.decomposition import TruncatedSVD

# For Cross-Validation
from sklearn.model_selection import StratifiedKFold

# For Doc2Vec
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec
import nltk
nltk.download('punkt')

# For load data
from google.colab import drive
from shutil import copyfile
from shutil import copytree

# Set Random Seed
import random
random.seed(10)
# `random.seed` only affects Python's own `random` module. The scikit-learn
# estimators in this notebook are created without `random_state`, in which case
# they draw from NumPy's global generator, so that one has to be seeded as well
# for the reported numbers to be repeatable on "Restart & Run All".
np.random.seed(10)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Link Google Drive account
drive.mount('/content/gdrive')

# Read the CSV through the absolute mount point used above: the relative
# 'gdrive/...' form only resolves when the working directory is /content.
# The copy lands in the current working directory, where the next cell reads it.
copyfile('/content/gdrive/My Drive/Progetto_TMeS/data_preprocessed_stopwords_class.csv', 'data_preprocessed_stopwords_class.csv')

Mounted at /content/gdrive


'data_preprocessed_stopwords_class.csv'

In [None]:
# Read the preprocessed corpus (copied from Drive) into a DataFrame
data = pd.read_csv(filepath_or_buffer='data_preprocessed_stopwords_class.csv')

In [None]:
# Topic classification only needs the token columns and the label column
columns_needed = ['stem_token_space', 'lemm', 'topic']
data = data[columns_needed]

In [None]:
# Vanilla accuracy helper
def results(y_test,y_pred):
  """Return the vanilla accuracy of `y_pred` with respect to `y_test`.

  The score is the trace of the confusion matrix (the diagonal cells)
  divided by the sum of all its cells.
  """
  cm = confusion_matrix(y_test, y_pred)
  n_on_diagonal = cm.trace()
  n_total = cm.sum()
  return n_on_diagonal/n_total


# Doc2Vec

In [None]:
# Target labels: the `topic` column as a NumPy array
y = data['topic'].values

In [None]:
# Features: the lemmatized text column
X = data['lemm'].values
# Hold out 20% of the original data as test set (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=10
)

In [None]:
# Map the labels 'a'..'h' to the integer tags 1..8
tags_index = {label: number for number, label in enumerate('abcdefgh', start=1)}

In [None]:
def _tag_documents(texts, labels):
  """Wrap every text as a TaggedDocument.

  The words are the whitespace-split text; the single tag is the integer
  looked up in `tags_index` for the label at the same position (8 when the
  label is not a key of `tags_index`).
  """
  return [
      TaggedDocument(words=text.split(), tags=[tags_index.get(labels[pos], 8)])
      for pos, text in enumerate(texts)
  ]

train_documents = _tag_documents(X_train, y_train)
test_documents = _tag_documents(X_test, y_test)

In [None]:
# Doc2Vec
# Define the model and build its vocabulary from the training documents
from tqdm import tqdm
model = Doc2Vec(dm=1, vector_size=350, min_count=1, epochs=30)
# tqdm only wraps the iteration to show a progress bar while the documents are collected
model.build_vocab(list(tqdm(train_documents)))


100%|██████████| 36021/36021 [00:00<00:00, 2047056.76it/s]


In [None]:
# Fit the Doc2Vec model on the tagged training documents (30 passes)
n_train_docs = len(train_documents)
model.train(train_documents, total_examples=n_train_docs, epochs=30)

In [None]:
# Function to apply Doc2Vec to the train and test sets
def vector_for_learning(model, input_docs):
    """Infer one Doc2Vec vector per document and collect the matching targets.

    Parameters
    ----------
    model : object exposing ``infer_vector(words, epochs=...)``
        The trained Doc2Vec model.
    input_docs : iterable
        Documents exposing ``.words`` (token list) and ``.tags`` (the first
        tag is used as the target).

    Returns
    -------
    (targets, feature_vectors) : tuple of two tuples
        ``targets[i]`` is ``doc.tags[0]`` and ``feature_vectors[i]`` the
        inferred vector of the i-th document. Both are empty tuples when
        ``input_docs`` is empty.
    """
    targets = []
    feature_vectors = []
    for doc in input_docs:
        targets.append(doc.tags[0])
        # `epochs` replaces the deprecated `steps` keyword, which gensim 4
        # no longer accepts.
        feature_vectors.append(model.infer_vector(doc.words, epochs=20))
    # Tuples keep the return type of the former `zip(*...)` implementation,
    # without its unpacking error on empty input.
    return tuple(targets), tuple(feature_vectors)

In [None]:
# Embed both splits; the returned targets are the numeric tags of the documents
train_embedded = vector_for_learning(model, train_documents)
y_train, X_train_d2v = train_embedded
test_embedded = vector_for_learning(model, test_documents)
y_test, X_test_d2v = test_embedded


In [None]:
# Random Forest trained on the Doc2Vec vectors
clf = RandomForestClassifier().fit(X_train_d2v, y_train)
y_pred = clf.predict(X_test_d2v)
# Confusion matrix of the test predictions (shown as the cell output)
matrix = confusion_matrix(y_test, y_pred)
matrix

array([[ 227,   71,   23,   67,  169,   81,   97,   45],
       [  22,  202,    7,  133,  331,  243,  168,   79],
       [  40,   47,  135,   64,   28,   60,   72,   22],
       [  16,   81,    4,  732,  126,  109,   57,   30],
       [  22,   88,    0,   55, 1016,  136,   88,   38],
       [  21,   99,    0,   71,  282,  635,  114,   89],
       [  15,   53,    4,   44,  114,   92,  816,  270],
       [  12,   45,    1,   25,   75,  106,  383,  609]])

In [None]:
# Vanilla accuracy of the Doc2Vec + Random Forest predictions on the test set
results(y_test=y_test, y_pred=y_pred)

0.48545414168332224

# Classification

## With Lemmatization

In [None]:
# Keep only the lemmatized text and its topic label
ds = data.loc[:, ['lemm', 'topic']]
y = ds['topic'].values

In [None]:
# Lemmatization
XX_l=ds.lemm.values
# Dictionary for results
result_lemma={}

# Per-classifier accuracies, keyed by (fold index, n-gram range, max_features)
result_dt={}
result_rf={}
result_knn={}
result_svm={}

# Hold out 20% as test set; the cross-validation below only uses X_data / y_data
X_data, X_test, y_data, y_test=train_test_split(XX_l, y, test_size=0.20, random_state=20)

X = X_data
# Label 
y = y_data

# Different N-grams
n_grams=[(1,1),(1,2),(2,2)]

for n_gram in n_grams:

  # Different number of features
  for maxf in [1000,5000,15000]:

    # Tf-Idf representation for the current n-gram range and vocabulary size
    tfidf_vect = TfidfVectorizer(min_df=2, max_df=0.4, max_features=maxf, ngram_range=n_gram)

    # Cross Validation (k=3)
    skf = StratifiedKFold(n_splits=3)

    # k is the fold index used in the result keys
    for k, (train_index, val_index) in enumerate(skf.split(X, y)):

      X_train, X_val = X[train_index], X[val_index]
      y_train, y_val = y[train_index], y[val_index]

      # Fit Tf-Idf on the training fold
      X_train_transformed = tfidf_vect.fit_transform(X_train)

      # Apply the fitted Tf-Idf model to the validation fold
      X_val_transformed = tfidf_vect.transform(X_val)

      # DECISION TREE
      clf = DecisionTreeClassifier().fit(X_train_transformed, y_train)
      y_pred = clf.predict(X_val_transformed)
      result_dt[(k,n_gram,maxf)]=results(y_val,y_pred)

      # RANDOM FOREST
      clf = RandomForestClassifier().fit(X_train_transformed, y_train)
      y_pred = clf.predict(X_val_transformed)
      result_rf[(k,n_gram,maxf)]=results(y_val,y_pred)

      # KNN
      clf = KNeighborsClassifier(n_neighbors=100).fit(X_train_transformed, y_train)
      y_pred = clf.predict(X_val_transformed)
      result_knn[(k,n_gram,maxf)]=results(y_val,y_pred)

      # SVM

      # Reduce dimension with SVD for SVM (150 components because of the high computational cost)
      svd = TruncatedSVD(n_components=150,random_state=20)
      svd.fit(X_train_transformed)
      X_train_svd = svd.transform(X_train_transformed)

      # Project the validation fold with the SVD fitted on the training fold.
      # The SVD must not be re-fitted on the validation data: that would give
      # a different latent basis, and the SVM would be scored on features
      # that do not correspond to the ones it was trained on.
      X_val_svd = svd.transform(X_val_transformed)

      # Train SVM
      clf = LinearSVC()
      clf.fit(X_train_svd, y_train)

      y_pred = clf.predict(X_val_svd)
      result_svm[(k,n_gram,maxf)]=results(y_val,y_pred)







In [None]:
# Group the per-fold accuracies of the lemmatization run by classifier
result_lemma={'DT':result_dt,'RF':result_rf,'SVM':result_svm,'KNN':result_knn}
# `risultati` (the summary of all experiments) is not created by any other
# cell, so a fresh kernel would fail here with a NameError. Create it on first
# use only, so that re-running this cell does not discard stored summaries.
if 'risultati' not in globals():
  risultati = {}
risultati

In [None]:
# Mean vanilla accuracy over the 3 cross-validation folds for every
# (classifier, n-gram range, max_features) combination, rounded to 3 decimals
risultati['lemmatization']={}
for n_gram in n_grams:
  for maxf in [1000,5000,15000]:
    for tecnica in ['DT','RF','SVM','KNN']:
      fold_scores = [result_lemma[tecnica][(fold,n_gram,maxf)] for fold in range(3)]
      risultati['lemmatization'][(tecnica,n_gram,maxf)] = round(sum(fold_scores)/3, 3)

## With Stemming

In [None]:
# Stemming
ds=data[['stem_token_space','topic']]
y=ds.topic.values
XX_s=ds.stem_token_space.values

# Dictionary for results
result_stemm={}

# Per-classifier accuracies, keyed by (fold index, n-gram range, max_features)
result_dt={}
result_rf={}
result_knn={}
result_svm={}

# Hold out 20% as test set; the cross-validation below only uses X_data / y_data
X_data, X_test, y_data, y_test=train_test_split(XX_s, y, test_size=0.20, random_state=20)

X = X_data
# Label 
y = y_data

# Different n-grams that give better results with lemmatization
n_grams=[(1,1),(1,2)]

for n_gram in n_grams:

  # Select number of features that maximize accuracy with lemmatization
  for maxf in [15000]:

    # Tf-Idf representation for the current n-gram range and vocabulary size
    tfidf_vect = TfidfVectorizer(min_df=2, max_df=0.4, max_features=maxf, ngram_range=n_gram)

    # Cross Validation (k=3)
    skf = StratifiedKFold(n_splits=3)

    # k is the fold index used in the result keys
    for k, (train_index, val_index) in enumerate(skf.split(X, y)):

      X_train, X_val = X[train_index], X[val_index]
      y_train, y_val = y[train_index], y[val_index]

      # Fit Tf-Idf on the training fold
      X_train_transformed = tfidf_vect.fit_transform(X_train)

      # Apply the fitted Tf-Idf model to the validation fold
      X_val_transformed = tfidf_vect.transform(X_val)

      # DECISION TREE
      clf = DecisionTreeClassifier().fit(X_train_transformed, y_train)
      y_pred = clf.predict(X_val_transformed)
      result_dt[(k,n_gram,maxf)]=results(y_val,y_pred)

      # RANDOM FOREST
      clf = RandomForestClassifier().fit(X_train_transformed, y_train)
      y_pred = clf.predict(X_val_transformed)
      result_rf[(k,n_gram,maxf)]=results(y_val,y_pred)

      # KNN
      clf = KNeighborsClassifier(n_neighbors=100).fit(X_train_transformed, y_train)
      y_pred = clf.predict(X_val_transformed)
      result_knn[(k,n_gram,maxf)]=results(y_val,y_pred)

      # SVM

      # Reduce dimension with SVD for SVM (150 components because of the high computational cost)
      svd = TruncatedSVD(n_components=150,random_state=20)
      svd.fit(X_train_transformed)
      X_train_svd = svd.transform(X_train_transformed)

      # Project the validation fold with the SVD fitted on the training fold.
      # The SVD must not be re-fitted on the validation data: that would give
      # a different latent basis, and the SVM would be scored on features
      # that do not correspond to the ones it was trained on.
      X_val_svd = svd.transform(X_val_transformed)

      # Train SVM
      clf = LinearSVC()
      clf.fit(X_train_svd, y_train)

      y_pred = clf.predict(X_val_svd)
      result_svm[(k,n_gram,maxf)]=results(y_val,y_pred)


In [None]:
# Group the per-fold accuracies of the stemming run by classifier
result_stemm = dict(DT=result_dt, RF=result_rf, SVM=result_svm, KNN=result_knn)

In [None]:
# Mean vanilla accuracy over the 3 cross-validation folds for every
# (classifier, n-gram range) pair at 15000 features, rounded to 3 decimals
risultati['stemm']={}
for n_gram in n_grams:
  maxf=15000
  for tecnica in ['DT','RF','SVM','KNN']:
    fold_scores = [result_stemm[tecnica][(fold,n_gram,maxf)] for fold in range(3)]
    risultati['stemm'][(tecnica,n_gram,maxf)] = round(sum(fold_scores)/3, 3)

## Final Test


In [None]:
# Evaluate the selected configuration on the held-out test set
ds = data.loc[:, ['lemm', 'topic']]
y = ds['topic'].values
XX_l = ds['lemm'].values

# Same arguments (test_size=0.20, random_state=20) as the split made before
# the lemmatization cross-validation
X_data, X_test, y_data, y_test = train_test_split(
    XX_l, y, test_size=0.20, random_state=20
)

# Training texts and their labels
X, y = X_data, y_data

# Tf-Idf with 1-grams and 2-grams, limited to 15000 features
tfidf_vect = TfidfVectorizer(min_df=2, max_df=0.4, max_features=15000, ngram_range=(1, 2))

# Fit Tf-Idf on the training portion, then apply it unchanged to the test portion
X_train_transformed = tfidf_vect.fit_transform(X_data)
X_test_transformed = tfidf_vect.transform(X_test)

# The best model is RANDOM FOREST
clf = RandomForestClassifier()
clf.fit(X_train_transformed, y_data)
y_pred = clf.predict(X_test_transformed)

# RESULTS (Final Vanilla Accuracy)
final_accuracy = results(y_test, y_pred)
print(final_accuracy)

0.6547856984232734
