In [16]:
import numpy as np
import pandas as pd
from nltk import tokenize, stem, corpus, download
import regex as re


In [17]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()


df = pd.read_csv("Train_zsoft.csv")
df2 = pd.read_csv("Test_zsoft.csv")
Y = encoder.fit_transform(list(df["label"]))

In [18]:
X_raw = list(df["text"]) + list(df2["text"])

In [19]:
def preprocess(content):

    content = content.lower()
    dateReg = r"((\d{4}.+?\d{4})|\b\d{4}\b)"
    moneyReg = r"(\d+[\., ]*)+(k|K|m|M)?($|£|€)"#(\d+\W*)+(k|K|m|M)?($|£|€)

    content = re.sub(moneyReg, " dollar ", content)
    content = re.sub(dateReg, " span ", content)

    tokenizer = tokenize.TreebankWordTokenizer()
    
    stopwords = corpus.stopwords.words('french') 
    stemmer = stem.SnowballStemmer("french")

    tokens = tokenizer.tokenize(content)

    

    return [stemmer.stem(token) for token in tokens if token not in stopwords and token.isalpha()]

In [20]:
initial_vocab = []
dataset = []
X_Kaggle = []
for i in range(len(X_raw)):
    temp = preprocess(X_raw[i])
    initial_vocab = initial_vocab + temp

    if i < len(Y):
        dataset = dataset + [temp]

In [21]:
from collections import Counter
frequencies = Counter(initial_vocab)
final_vocab = []
k = 1
for key,value in frequencies.items():
    if value >= k:
        final_vocab.append(key)


token_to_index = dict()
i = 0 
for i in range(len(final_vocab)):
    token_to_index[final_vocab[i]] = i 

In [22]:
def transform(tokens, final_vocab):
    transformed = [token_to_index[token] for token in tokens if token in final_vocab]
    return transformed

def counterFeatures(tokens):
    indices = transform(tokens, final_vocab)
    X = np.zeros((1,len(final_vocab)))

    for i in indices:
        X[0][i] += 1 

    return X 

In [23]:
n = len(final_vocab)
m = len(df)
X_counter_full = np.zeros((m, n), dtype=np.uint16)

for i in range(m) :
    X_counter_full[i][:] = counterFeatures(dataset[i]) 

X_binary = np.copy(X_counter_full)
X_binary[X_binary > 1] = 1

In [25]:
from sklearn.model_selection import train_test_split

X_counter_train, X_counter_test, Y_counter_train, Y_counter_test = train_test_split(X_counter_full, Y, test_size=0.2) 
X_binary_train, X_binary_test, Y_binary_train, Y_binary_test = train_test_split(X_binary, Y, test_size=0.2) 

In [27]:
from sklearn.naive_bayes import MultinomialNB
bayes_counter = MultinomialNB()
bayes_counter.fit(X_binary, Y)

MultinomialNB()

In [28]:
print("Training Accuracy:",(bayes_counter.score(X_binary, Y))*100,"%")
#print("Testing Accuracy:",(bayes_counter.score(X_counter_test, Y_counter_test))*100,"%")

Training Accuracy: 81.22075077168176 %


In [37]:
from sklearn.naive_bayes import MultinomialNB
bayes_binary = MultinomialNB()
bayes_binary.fit(X_binary, Y)

MultinomialNB()

In [38]:
print("Training Accuracy:",(bayes_binary.score(X_binary, Y))*100,"%")
#print("Testing Accuracy:",(bayes_binary.score(X_binary_test, Y_binary_test))*100,"%")

Training Accuracy: 81.15768860566232 %


In [39]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
bernouli_binary = BernoulliNB()
bernouli_binary.fit(X_binary, Y)

BernoulliNB()

In [40]:
print("Training Accuracy:",(bernouli_binary.score(X_binary, Y))*100,"%")
#print("Testing Accuracy:",(bayes_binary.score(X_binary_test, Y_binary_test))*100,"%")

Training Accuracy: 80.78595373228451 %


In [17]:
from sklearn.linear_model import LogisticRegression 
regBinary = LogisticRegression(max_iter=1000, penalty="l2", random_state=123)
regBinary.fit(X_binary, Y)

LogisticRegression(max_iter=1000, random_state=123)

In [18]:
print("Training Accuracy:",(regBinary.score(X_binary, Y))*100,"%")
#print("Testing Accuracy:",(regBinary.score(X_binary_test, Y_binary_test))*100,"%")

: 

: 

In [14]:
from sklearn.svm import SVC, LinearSVC

svc = LinearSVC(C=0.1 ,verbose=1, random_state=10, max_iter=10000)
svc.fit(X_binary, Y)

[LibLinear]..*
optimization finished, #iter = 25
Objective value = -857.646447
nSV = 19881
..*
optimization finished, #iter = 27
Objective value = -1556.399866
nSV = 24506
..*
optimization finished, #iter = 28
Objective value = -1206.137656
nSV = 20360


LinearSVC(C=0.1, max_iter=10000, random_state=10, verbose=1)

In [15]:
print("Training Accuracy:",(svc.score(X_binary, Y))*100,"%")


Training Accuracy: 83.36154535497361 %


In [16]:
temp = list(df2["text"])
X_Kaggle = np.zeros((len(temp), len(final_vocab)), dtype=np.uint16)
for i in range(len(temp)) :
    x = preprocess(temp[i])
    X_Kaggle[i][:] = counterFeatures(x) 

prediction = svc.predict(X_Kaggle)
dfKaggle = df2.copy()
dfKaggle["label"] = encoder.inverse_transform(prediction)
dfKaggle.drop("text", inplace=True, axis=1)
dfKaggle.to_csv("prediction.csv", index=False)