In [None]:
# Uncomment the lines below if you are running this notebook on Google Colab
# !pip install sklearn_crfsuite
# !pip install emoji
# !pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

from pythainlp import word_tokenize
from tqdm import tqdm_notebook
from pythainlp.ulmfit import process_thai

#viz
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import nltk
import string
import re
import matplotlib

%matplotlib inline

In [None]:
# Download the labelled spreadsheet from Google Drive (Colab only) and load it
# into `df`. The PyDrive setup only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd

# Authenticate the Colab user and create the PyDrive client from the
# application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download a file based on its file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
file_id = ' ' # FILE_ID in Google Drive
if not file_id.strip():
    # Fail with a clear message instead of sending a blank ID to the Drive API.
    raise ValueError("Set file_id to the Google Drive file ID of the spreadsheet before running this cell.")

downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('all.xlsx')  # saved next to the notebook as all.xlsx
df  = pd.read_excel('all.xlsx')
# Preview only the first rows rather than rendering the whole frame.
df.head()

In [None]:
def remove_punct(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Only the characters listed in ``string.punctuation`` are dropped; all
    other characters (including non-ASCII punctuation) are kept as they are.

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation.
    """
    # A single translate() pass deletes every character in string.punctuation.
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_url(text):
    """Strip ASCII punctuation from ``text``, then delete ``http...`` tokens.

    Punctuation is removed first, so a URL is matched in its punctuation-free
    form: the literal ``http`` followed by every non-whitespace character up
    to the next whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_punct = "".join(
        ch for ch in text if ch not in string.punctuation
    )
    cleaned = re.sub(r"http\S+", "", without_punct)
    return cleaned

def remove_n(text):
    """Strip ASCII punctuation from ``text`` and turn newlines into spaces.

    Each newline character becomes exactly one space; consecutive newlines
    therefore produce consecutive spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, single-line string.
    """
    without_punct = "".join(
        ch for ch in text if ch not in string.punctuation
    )
    single_line = re.sub(r"\n", " ", without_punct)
    return single_line

#https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
# Explicit emoji/symbol blocks. A single wide range from U+24C2 to U+1F251
# would also swallow CJK ideographs, kana, Hangul and fullwidth forms, so the
# ranges are listed individually instead.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"     # emoticons
    "\U0001F300-\U0001F5FF"     # symbols & pictographs
    "\U0001F680-\U0001F6FF"     # transport & map symbols
    "\U0001F1E0-\U0001F1FF"     # regional indicators (flags)
    "\U0001F900-\U0001F9FF"     # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"     # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"     # mahjong/domino/cards, enclosed alphanumerics & ideographs
    "\u25A0-\u25FF"             # geometric shapes
    "\u2600-\u27BF"             # miscellaneous symbols and dingbats
    "\u2B00-\u2BFF"             # miscellaneous symbols and arrows
    "\u24C2"                    # circled M
    "\u3030\u303D\u3297\u3299"  # emoji that live in CJK symbol blocks
    "\uFE0F"                    # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Ordinary letters of any script (Thai, Latin, CJK, Hangul, ...) are kept;
    only code points in the emoji/symbol blocks of ``_EMOJI_PATTERN`` are
    deleted.

    Args:
        text: The string to clean.

    Returns:
        A new string with every matched run of emoji/symbols deleted.
    """
    return _EMOJI_PATTERN.sub(r'', text)

In [None]:
# Clean the text column in a fixed order:
# punctuation -> URLs -> emoji -> newlines.
df['text'] = (
    df['text']
    .apply(remove_punct)
    .apply(remove_url)
    .apply(remove_emoji)
    .apply(remove_n)
)
df.head(30)

In [None]:
# Build the modelling frame: only the text and the label, with the label
# column renamed from 'agreeOp' to 'col'. The explicit .copy() makes all_df an
# independent frame, so the in-place relabelling below does not trigger
# pandas' SettingWithCopyWarning.
all_df = df.rename(columns={'agreeOp': 'col'})[['text', 'col']].copy()

# Relabel in two steps: 1 -> 0, then negate every label, so -1 -> 1 and 0
# stays 0.
# NOTE(review): presumably agreeOp only takes values in {-1, 0, 1}, which makes
# 'col' a binary target — confirm against the spreadsheet.
all_df.loc[all_df['col'] == 1, 'col'] = 0
all_df['col'] *= -1

In [None]:
# These two helpers are used below and are not imported anywhere else in the
# notebook.
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Hold out 15% of the rows as the test set, then 15% of the remainder as the
# validation set.
# NOTE: all_df is rebound to the non-test portion (train + valid); the
# vectorizer cells fit on it. Re-running this cell on its own splits the
# already-reduced frame again.
all_df, test_df = train_test_split(all_df, test_size=0.15, random_state=42)
train_df, valid_df = train_test_split(all_df, test_size=0.15, random_state=1111)

# Keep every `undersam`-th row of the col == 0 class.
undersam = 4

low = train_df[train_df.col==1]
neu = train_df[train_df.col==0][::undersam]
# Draw col == 1 rows with replacement until both groups have the same size.
up = resample(low,
              replace=True,        # sample with replacement
              n_samples=len(neu),  # match the size of the thinned col == 0 group
              random_state=1111)   # reproducible results
upsampled = pd.concat([up, neu])
train_df = upsampled

In [None]:
# Give the train and validation frames a fresh default index, discarding the
# old one (test_df keeps its index).
train_df, valid_df = (
    frame.reset_index(drop=True) for frame in (train_df, valid_df)
)

# Label column of each split.
y_train, y_valid, y_test = (
    frame["col"] for frame in (train_df, valid_df, test_df)
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over 1- and 2-grams of process_thai tokens.
tfidf = TfidfVectorizer(
    tokenizer=process_thai,
    ngram_range=(1,2),
    min_df=20,
    sublinear_tf=True,
)
# The vectorizer is fitted on all_df (train + valid rows at this point) and
# the fitted object is reused to transform every split.
tfidf_fit = tfidf.fit(all_df["text"])

text_train, text_valid, text_test = (
    tfidf_fit.transform(frame["text"])
    for frame in (train_df, valid_df, test_df)
)

# Dense copies of the transformed matrices; these names are reassigned by the
# CountVectorizer cell if that cell is run afterwards.
X_train, X_valid, X_test = (
    matrix.toarray() for matrix in (text_train, text_valid, text_test)
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw count features over 1- and 2-grams of process_thai tokens.
token = CountVectorizer(
    tokenizer=process_thai,
    ngram_range=(1,2),
    min_df=20,
)
# The vectorizer is fitted on all_df (train + valid rows at this point) and
# the fitted object is reused to transform every split.
token_fit = token.fit(all_df["text"])

text_train, text_valid, text_test = (
    token_fit.transform(frame["text"])
    for frame in (train_df, valid_df, test_df)
)

# Dense copies of the transformed matrices; these overwrite any X_* arrays
# produced by an earlier feature cell.
X_train, X_valid, X_test = (
    matrix.toarray() for matrix in (text_train, text_valid, text_test)
)

In [None]:
# Oversample the training features/labels with SMOTE using a fixed seed.
# X_train and y_train are replaced by the resampled versions, so re-running
# this cell resamples data that has already been resampled.
# NOTE(review): the split cell already upsamples the col == 1 rows to
# len(neu), so the classes look balanced before this step — confirm SMOTE is
# still needed here.
from imblearn.over_sampling import SMOTE
oversample = SMOTE(random_state=1111)
X_train,y_train = oversample.fit_resample(X_train,y_train)

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fit Bernoulli naive Bayes on the training features.
rfc = BernoulliNB().fit(X_train, y_train)

# Predict on the validation split.
rfc_pred = rfc.predict(X_valid)

# One value per line, in order: accuracy, precision, recall, F1.
print(
    accuracy_score(y_valid, rfc_pred),
    precision_score(y_valid, rfc_pred),
    recall_score(y_valid, rfc_pred),
    f1_score(y_valid, rfc_pred),
    sep="\n",
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fit a decision tree (fixed seed) on the training features.
rfc = DecisionTreeClassifier(random_state=1111).fit(X_train, y_train)

# Predict on the validation split.
rfc_pred = rfc.predict(X_valid)

# One value per line, in order: accuracy, precision, recall, F1.
print(
    accuracy_score(y_valid, rfc_pred),
    precision_score(y_valid, rfc_pred),
    recall_score(y_valid, rfc_pred),
    f1_score(y_valid, rfc_pred),
    sep="\n",
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fit a random forest (fixed seed) on the training features.
rfc = RandomForestClassifier(random_state=1111).fit(X_train, y_train)

# Predict on the validation split.
rfc_pred = rfc.predict(X_valid)

# One value per line, in order: accuracy, precision, recall, F1.
print(
    accuracy_score(y_valid, rfc_pred),
    precision_score(y_valid, rfc_pred),
    recall_score(y_valid, rfc_pred),
    f1_score(y_valid, rfc_pred),
    sep="\n",
)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fit an MLP classifier (L-BFGS solver, fixed seed) on the training features.
# NOTE(review): hidden_layer_sizes=() presumably means no hidden layers, i.e.
# a purely linear model — confirm this is intended.
rfc = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(),
    random_state=1111,
).fit(X_train, y_train)

# Predict on the validation split.
rfc_pred = rfc.predict(X_valid)

# One value per line, in order: accuracy, precision, recall, F1.
print(
    accuracy_score(y_valid, rfc_pred),
    precision_score(y_valid, rfc_pred),
    recall_score(y_valid, rfc_pred),
    f1_score(y_valid, rfc_pred),
    sep="\n",
)

In [None]:
# LogisticRegression is not imported anywhere else in the notebook.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Fit an L2-regularised logistic regression (fixed seed) on the training
# features.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument — confirm against the installed version.
rfc = LogisticRegression(penalty="l2", solver="lbfgs", dual=False, multi_class="ovr", random_state=1111).fit(X_train, y_train)

# Predict on the validation split.
rfc_pred = rfc.predict(X_valid)

# Printed in order: accuracy, precision, recall, F1.
print(accuracy_score(y_valid, rfc_pred))
print(precision_score(y_valid, rfc_pred))
print(recall_score(y_valid, rfc_pred))
print(f1_score(y_valid, rfc_pred))

In [None]:
# Persist the classifier currently bound to `rfc` to 'disAgreeOp.sav'.
# Every model cell assigns its fitted estimator to `rfc`, so the file holds
# whichever model cell was run last.
import joblib
filename = 'disAgreeOp.sav'
joblib.dump(rfc, filename)