In [4]:
import sys
import os
import time
import pandas as pd
import numpy as np
import collections

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
#from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold
#from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import f1_score

In [123]:
def classifaction_report_dataFrame(report):
    report_data = pd.DataFrame()
    lines = report.split('\n')
    for line in lines[2:-3]:
        row = {}
        row_data = line.split('      ')
        report_data = report_data.append({"class": row_data[0], "precision": float(row_data[1]), "recall": float(row_data[2]), "f1_score": float(row_data[3])}, ignore_index=True)
    return report_data

In [102]:
def prepereDataWithoutLemmatisation():
    data = np.empty((0))
    data_labels = np.empty((0))

    comments = pd.read_csv('FilmWeb-commentsRates.csv')
    comments = comments.reset_index()
    comments.loc[comments.emotion == 'neutral', 'emotion'] = "negative"
    comments.groupby(['emotion']).size()
    for index, comment in comments.iterrows():
        data = np.append(data, comment['Text'])
        data_labels = np.append(data_labels, comment['emotion'])
    return [data, data_labels]

In [201]:
def prepereDataWithLemmatisation():
    data = np.empty((0))
    data_labels = np.empty((0))

    comments = pd.read_csv('FilmWeb-lemmatisation-commentsRates.csv')
    comments = comments.reset_index()
    comments.loc[comments.emotion == 'neutral', 'emotion'] = "negative"
    print comments.groupby(['emotion']).size()
    for index, comment in comments.iterrows():
        data = np.append(data, comment['comment'])
        data_labels = np.append(data_labels, comment['emotion'])
    return [data, data_labels]

In [211]:
def SVM(iterations, dataset, is_shuffle):
    data = dataset[0]
    data_labels = dataset[1]
    report_data_Linear = pd.DataFrame()
    report_data_Liblinear = pd.DataFrame()
    report_data_Liblinear2 = pd.DataFrame()

    #Some classification problems can exhibit a large imbalance in the distribution of the target classes: 
    #for instance there could be several times more negative samples than positive samples.
    #StratifiedKFold is a variation of k-fold which returns stratified folds: each set contains approximately 
    #the same percentage of samples of each target class as the complete set.
    #K-krotna walidacja
    
    #shuffle - przetasowanie zbiorów
    sss = StratifiedKFold(n_splits=iterations, shuffle=is_shuffle)
    classifier_liblinear2 = svm.LinearSVC()
    for train_index, test_index in sss.split(data, data_labels):
        train_data, test_data = data[train_index], data[test_index]
        train_labels, test_labels = data_labels[train_index], data_labels[test_index]

        # Create feature vectors
        
        #min_df - When building the vocabulary ignore terms that have a document frequency strictly lower 
        #than the given threshold. This value is also called cut-off in the literature.
        #If float, the parameter represents a proportion of documents, integer absolute counts.
        #This parameter is ignored if vocabulary is not None.
        
        vectorizer = TfidfVectorizer(min_df=5,max_df = 0.8,sublinear_tf=True,use_idf=True)
        train_vectors = vectorizer.fit_transform(train_data)
        test_vectors = vectorizer.transform(test_data)

        # Perform classification with SVM, kernel=linear
        classifier_linear = svm.SVC(kernel='linear')
        t0 = time.time()
        classifier_linear.fit(train_vectors, train_labels)
        t1 = time.time()
        prediction_linear = classifier_linear.predict(test_vectors)
        t2 = time.time()
        time_linear_train = t1-t0
        time_linear_predict = t2-t1

        # Perform classification with SVM, kernel=linear
        classifier_liblinear = svm.LinearSVC()
        t0 = time.time()
        classifier_liblinear.fit(train_vectors, train_labels)
        t1 = time.time()
        prediction_liblinear = classifier_liblinear.predict(test_vectors)
        t2 = time.time()
        time_liblinear_train = t1-t0
        time_liblinear_predict = t2-t1
        
        # Perform classification with SVM, kernel=linear
        t0 = time.time()
        classifier_liblinear2.fit(train_vectors, train_labels)
        t1 = time.time()
        prediction_liblinear2 = classifier_liblinear.predict(test_vectors)
        t2 = time.time()
        time_liblinear_train2 = t1-t0
        time_liblinear_predict2 = t2-t1

        reportLinear = classification_report(test_labels, prediction_linear)
        reportLiblinear2 = classification_report(test_labels, prediction_liblinear2)
        
        lines = reportLinear.split('\n')
        for line in lines[2:-3]:
            row = {}
            row_data = line.split('      ')
            report_data_Linear = report_data_Linear.append({"class": row_data[0], "precision": float(row_data[1]), "recall": float(row_data[2]), "f1_score": float(row_data[3]), "support": float(row_data[4])}, ignore_index=True)

        lines = reportLiblinear.split('\n')
        for line in lines[2:-3]:
            row = {}
            row_data = line.split('      ')
            report_data_Liblinear = report_data_Liblinear.append({"class": row_data[0], "precision": float(row_data[1]), "recall": float(row_data[2]), "f1_score": float(row_data[3]), "support": float(row_data[4])}, ignore_index=True)
        
        lines = reportLiblinear2.split('\n')
        for line in lines[2:-3]:
            row = {}
            row_data = line.split('      ')
            report_data_Liblinear2 = report_data_Liblinear2.append({"class": row_data[0], "precision": float(row_data[1]), "recall": float(row_data[2]), "f1_score": float(row_data[3]), "support": float(row_data[4])}, ignore_index=True)
        
    print 'Training data:'
    counter=collections.Counter(train_labels)
    print(counter)

    print 'Test data:'
    counter=collections.Counter(test_labels)
    print(counter)
    
    return [report_data_Liblinear, report_data_Linear, report_data_Liblinear2]

In [212]:
iterations = 10
shuffle = True

In [213]:
#SVM without Lemmatisation
dataset = prepereDataWithoutLemmatisation()

print 'number of dataset: ' + str(iterations)
data = SVM(iterations, dataset, shuffle)
print 'Linear report:'
printReportLinear(data)
print 'Liblinear report:'
printReportLiblinear(data)

number of dataset: 10
Training data:
Counter({'positive': 2763, 'negative': 1247})
Test data:
Counter({'positive': 307, 'negative': 138})
Linear report:
          class  f1_score  precision  recall  support
0      negative      0.57       0.57    0.58    461.0
2      negative      0.57       0.57    0.58    461.0
4      negative      0.57       0.57    0.58    461.0
6      negative      0.57       0.57    0.58    461.0
8      negative      0.57       0.57    0.58    461.0
10     negative      0.57       0.57    0.58    461.0
12     negative      0.57       0.57    0.58    461.0
14     negative      0.57       0.57    0.58    461.0
16     negative      0.57       0.57    0.58    461.0
18     negative      0.57       0.57    0.58    461.0
          class  f1_score  precision  recall  support
1      positive      0.81       0.81     0.8   1023.0
3      positive      0.81       0.81     0.8   1023.0
5      positive      0.81       0.81     0.8   1023.0
7      positive      0.81       0.81 

In [215]:
#SVM with Lemmatisation
dataset = prepereDataWithLemmatisation()

print 'number of dataset: ' + str(iterations)
data = SVM(iterations, dataset, shuffle)
print 'Linear report:'
printReportLinear(data)
print 'Liblinear report:'
printReportLiblinear(data)

print 'Liblinear2 report:'
printReportLiblinear2(data)

emotion
negative    1284
positive    2879
dtype: int64
number of dataset: 10
Training data:
Counter({'positive': 2592, 'negative': 1156})
Test data:
Counter({'positive': 287, 'negative': 128})
Linear report:
          class  f1_score  precision  recall  support
0      negative      0.57       0.57    0.58    461.0
2      negative      0.57       0.57    0.58    461.0
4      negative      0.57       0.57    0.58    461.0
6      negative      0.57       0.57    0.58    461.0
8      negative      0.57       0.57    0.58    461.0
10     negative      0.57       0.57    0.58    461.0
12     negative      0.57       0.57    0.58    461.0
14     negative      0.57       0.57    0.58    461.0
16     negative      0.57       0.57    0.58    461.0
18     negative      0.57       0.57    0.58    461.0
          class  f1_score  precision  recall  support
1      positive      0.81       0.81     0.8   1023.0
3      positive      0.81       0.81     0.8   1023.0
5      positive      0.81       0.81

In [204]:
def printReportLiblinear(dataframe):
    liblinear = dataframe[1]
    for index, data in liblinear.groupby('class'):
        print(data)
        
def printReportLiblinear2(dataframe):
    liblinear = dataframe[2]
    for index, data in liblinear.groupby('class'):
        print(data)

In [175]:
def printReportLinear(dataframe):
    linear = dataframe[0]
    for index, data in linear.groupby('class'):
        print(data)

In [168]:
linear_negative

Unnamed: 0,class,f1_score,precision,recall,support


In [165]:
linear

Unnamed: 0,class,f1_score,precision,recall,support
0,negative,0.49,0.37,0.71,139.0
1,positive,0.58,0.78,0.47,307.0
2,negative,0.55,0.65,0.47,139.0
3,positive,0.83,0.79,0.88,307.0
4,negative,0.49,0.59,0.42,139.0
5,positive,0.81,0.77,0.87,307.0
6,negative,0.57,0.67,0.5,139.0
7,positive,0.84,0.8,0.89,307.0
8,negative,0.41,0.53,0.33,139.0
9,positive,0.8,0.74,0.87,307.0
