In [109]:
import sys
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split



In [125]:
# Training data
file_name = "trainingdata.txt"
with open(file_name, 'r') as file:
    data = file.read()
raw_training_data = data.split("\n")
# print(training_data)
# # Testing data
# data = sys.stdin.readlines()
# testing_data = [line.rstrip() for line in data]

testing_df = pd.read_csv('testing_data.txt')
testing_df.columns = ['document']
testing_df


Unnamed: 0,document
0,This is a document
1,this is another document
2,documents are seperated by newlines


In [179]:
# 1. Store it in the appropiate way -> panda data frame
n_observations = int(raw_training_data[0].rstrip())

pattern = r"^\d+"

training_list = []
for i in range(1,n_observations+1):
    input_str = raw_training_data[i]
    
    match = re.match(pattern, input_str)
    row = list([int(match.group()), input_str[match.end()+1:]])
    training_list.append(row)

colnames = ['label', 'document']
raw_training_df = pd.DataFrame(training_list, columns= colnames)
raw_training_df["label"].astype(int)

dataset_inspection(raw_training_df)

Missing values:  label       0
document    0
dtype: int64
Number of duplicated rows:  58
Percentage of duplicated rows:  label       0.010574
document    0.010574
dtype: float64
Label counts:
1    2840
2    1596
6     253
3     251
8     206
7     190
4     108
5      41
Name: label, dtype: int64
Label percentages:
1    51.777575
2    29.097539
6     4.612580
3     4.576117
8     3.755697
7     3.463993
4     1.969006
5     0.747493
Name: label, dtype: float64
Minimum document length: 30
Maximum document length: 5295
Average document length: 605.4659981768459
Median document length: 375.0
q1 document length: 165.0
q3 document length: 686.0


In [178]:
def dataset_inspection(df):
        # Missingness
        missing_values = df.isnull().sum()
        print('Missing values: ', missing_values)

        # Duplicated rows
        print('Number of duplicated rows: ', df.duplicated().sum())
        print('Percentage of duplicated rows: ', (df.duplicated().sum() / df.apply(len)))

        # Count and percentage per label
        label_counts = df['label'].value_counts()
        label_percentages = df['label'].value_counts(normalize=True) * 100
        print('Label counts:')
        print(label_counts)
        print('Label percentages:')
        print(label_percentages)

        # Evaluate distribution of predictors
        predictors = "document"
        df['document_length'] = df[predictors].apply(len)
        print('Minimum document length:', df['document_length'].min())
        print('Maximum document length:', df['document_length'].max())
        print('Average document length:', df['document_length'].mean())
        print('Median document length:', df['document_length'].median())
        print('q1 document length:', df['document_length'].quantile(0.25))
        print('q3 document length:', df['document_length'].quantile(0.75))

In [133]:
# Clearly, the data set is umbalanced, let's reabalance it

# Separate majority and minority classes
df_majority = raw_training_df[(raw_training_df['label'] == 1)|(raw_training_df['label'] == 2)]
df_minority = raw_training_df[(raw_training_df['label'] != 1) & (raw_training_df['label'] != 2)]

# OPTION 1 : DOWNSAMPLE MAJORITY CLASS
# Downsample majority class
df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=len(df_minority),
                                   random_state=42)

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])


# OPTION 2 : UPSAMPLE MINORITY CLASS
# Upsample minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority)*3,
                                 random_state=42)

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])


# Let's try first with upsampling
training_df = df_upsampled

In [1]:
from sklearn.utils import resample

def upsample_function(initial_df, minority_classes, target_size_per_class):
        initial_df = raw_training_df

        for c in minority_classes:
                df_majority = initial_df[raw_training_df['label'] != c]
                df_minority = initial_df[raw_training_df['label'] == c]

                # Upsample minority class
                df_minority_upsampled = resample(df_minority,
                                                replace=True,
                                                n_samples=round(target_size_per_class),
                                                random_state=42)

                # # Combine majority class with upsampled minority class
                df_upsampled = pd.concat([df_majority, df_minority_upsampled])
                initial_df = df_upsampled
        return df_upsampled

def downsample_function(initial_df, majority_classes, target_size_per_class):
        initial_df = raw_training_df

        for c in majority_classes:
                df_majority = initial_df[raw_training_df['label'] == c]
                df_minority = initial_df[raw_training_df['label'] != c]

                # Downsample majority class
                df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=target_size_per_class,
                                   random_state=42)

                # Combine minority class with downsampled majority class
                df_downsampled = pd.concat([df_majority_downsampled, df_minority])
        return df_downsampled

In [164]:
minority_classes = [2,3,4,5,6,7,8]
n_classes = 8
target_size_per_class = 2500

training_df = upsample_function(raw_training_df, minority_classes, n_classes, target_size_per_class)

  df_majority = initial_df[raw_training_df['label'] != c]
  df_minority = initial_df[raw_training_df['label'] == c]
  df_majority = initial_df[raw_training_df['label'] != c]
  df_minority = initial_df[raw_training_df['label'] == c]
  df_majority = initial_df[raw_training_df['label'] != c]
  df_minority = initial_df[raw_training_df['label'] == c]
  df_majority = initial_df[raw_training_df['label'] != c]
  df_minority = initial_df[raw_training_df['label'] == c]
  df_majority = initial_df[raw_training_df['label'] != c]
  df_minority = initial_df[raw_training_df['label'] == c]
  df_majority = initial_df[raw_training_df['label'] != c]
  df_minority = initial_df[raw_training_df['label'] == c]


In [175]:
# 2 Build-up of the text classifier ##### 

vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()

# classifier = MultinomialNB()
classifier = LinearSVC(class_weight='balanced')

# Split in test and training data 
X = vectorizer.fit_transform(training_df['document'])
y = training_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Train the model in training data
classifier.fit(X_train, y_train)

# Test in testing data AND obtaining classification_report
y_pred = classifier.predict(X_test)
report = classification_report(y_test, y_pred, digits=4)
print(report)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

              precision    recall  f1-score   support

           1     0.9918    0.9794    0.9856      1118
           2     0.9792    0.9900    0.9846      1001
           3     0.9990    1.0000    0.9995      1025
           4     1.0000    1.0000    1.0000      1002
           5     1.0000    1.0000    1.0000       958
           6     0.9990    1.0000    0.9995      1025
           7     0.9783    0.9930    0.9856      1001
           8     0.9929    0.9791    0.9860      1006

    accuracy                         0.9925      8136
   macro avg     0.9926    0.9927    0.9926      8136
weighted avg     0.9925    0.9925    0.9925      8136

[[1095   21    1    0    0    0    1    0]
 [   9  991    0    0    0    1    0    0]
 [   0    0 1025    0    0    0    0    0]
 [   0    0    0 1002    0    0    0    0]
 [   0    0    0    0  958    0    0    0]
 [   0    0    0    0    0 1025    0    0]
 [   0    0    0    0    0    0  994    7]
 [   0    0    0    0    0    0   21  985]]


In [176]:
X_eval = vectorizer.transform(testing_df["document"])
X_eval

<3x19162 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [177]:
# 3. Prediction on validation data data
X_eval = vectorizer.transform(testing_df["document"])
y_eval = classifier.predict(X_eval)

for y in y_eval:
    print(y)

# # # Let's see what accuracy we got
# # y_train_pred = classifier.predict(X_train)
# # train_accuracy = accuracy_score(y_train, y_train_pred)
# # print("Training accuracy:", train_accuracy)

2
2
2


In [160]:
# It seems LinearSVC is not working greatly --> ensemble methods

vectorizer = CountVectorizer()

# classifier = MultinomialNB()
# classifier = LinearSVC(class_weight='balanced')
classifier = RandomForestClassifier(random_state=123)

# Split in test and training data 
X = vectorizer.fit_transform(training_df['document'])
y = training_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Train the model in training data
classifier.fit(X_train, y_train)

# Test in testing data AND obtaining classification_report
y_pred = classifier.predict(X_test)
report = classification_report(y_test, y_pred, digits=4)
print(report)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)


              precision    recall  f1-score   support

           1     0.9943    0.9560    0.9747      1090
           2     0.9330    0.9846    0.9581       651
           3     0.9980    1.0000    0.9990      1021
           4     1.0000    1.0000    1.0000      1027
           5     1.0000    1.0000    1.0000       991
           6     0.9960    1.0000    0.9980       991
           7     0.9902    0.9951    0.9927      1019
           8     0.9949    0.9898    0.9924       985

    accuracy                         0.9906      7775
   macro avg     0.9883    0.9907    0.9894      7775
weighted avg     0.9909    0.9906    0.9906      7775

[[1042   46    1    0    0    1    0    0]
 [   6  641    1    0    0    3    0    0]
 [   0    0 1021    0    0    0    0    0]
 [   0    0    0 1027    0    0    0    0]
 [   0    0    0    0  991    0    0    0]
 [   0    0    0    0    0  991    0    0]
 [   0    0    0    0    0    0 1014    5]
 [   0    0    0    0    0    0   10  975]]


Had I had more time, I would have liked to evaluate:
- Other balancing strategies
- Other classifier systems
