# Language Detection Model

## Setup

In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import os
import string

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.model_selection import cross_validate, StratifiedKFold

import warnings
warnings.filterwarnings("ignore")

In [2]:
# load data for language detection
df1 = pd.read_csv('data/LanguageDetection.csv')
df1.columns = ['text', 'target']
df1.head()

Unnamed: 0,text,target
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [3]:
# load data for offensive english classification
df2 = pd.read_csv('data/olid-training-v1.0.csv')
df2 = df2[['tweet', 'subtask_a']]
df2.columns = ['text', 'target']
df2.head()

Unnamed: 0,text,target
0,@USER She should ask a few native Americans wh...,OFF
1,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF
2,Amazon is investigating Chinese employees who ...,NOT
3,"@USER Someone should'veTaken"" this piece of sh...",OFF
4,@USER @USER Obama wanted liberals &amp; illega...,NOT


In [4]:
def preprocess(text, rm_emoji=False):
    
    text = text.replace('@USER', '') # remove mentions (@USER)
    text = text.replace('URL', '') # remove URLs
    text = text.replace('&amp', 'and') # replace ampersand (&) with and
    text = text.replace('&lt','') # remove &lt
    text = text.replace('&gt','') # remove &gt
    text = text.replace('\d+','') # remove numbers
    text = text.lower() # lowercase

    # remove punctuation
    for p in string.punctuation:
        text = text.replace(p, '')

    # remove emojis
    if rm_emoji:
        text = text.encode('ascii', 'ignore').decode('ascii')
    
    text = text.strip() # leading and trailing whitespaces

    return text

In [5]:
le1 = LabelEncoder()
classes1 = [
    'Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
    'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
    'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish']

df1['y'] = le1.fit_transform(df1['target'])
df1['text'] = df1['text'].astype(str).apply(preprocess)

In [6]:
le2 = LabelEncoder()
classes2 = ['Offensive', 'Not offensive']

df2['y'] = le2.fit_transform(df2['target'])
df2['text'] = df2['text'].astype(str).apply(preprocess, rm_emoji=True)

## Prediciton pipeline

In [7]:
# setup model pipeline and cv
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
# language detection model
ld_model = Pipeline([
    ('vect', CountVectorizer(strip_accents='unicode', lowercase=True)),
    ('tfidf', TfidfTransformer()),
    ('svd', TruncatedSVD(n_components=40)),
    ('clf', HistGradientBoostingClassifier(learning_rate=0.05, random_state=42))
])

# obtain cv score
print('Validating language detection model...')
cv_results1 = cross_validate(ld_model, df1['text'], df1['y'], cv = cv, scoring = 'accuracy', verbose=0, n_jobs=-1)
m1, s1 = np.mean(cv_results1['test_score']), np.std(cv_results1['test_score'])
print(f'CV accuracy: {m1:.4f} ({s1:.4f})')

# train final models
print('Fitting final language detection model...')
ld_model = ld_model.fit(df1['text'], df1['y'])

Validating language detection model...
CV accuracy: 0.9672 (0.0041)
Fitting final language detection model...


In [9]:
# offensive english classification model
oe_model = Pipeline([
    ('vect', CountVectorizer(strip_accents='unicode', lowercase=True)),
    ('tfidf', TfidfTransformer()),
    ('svd', TruncatedSVD(n_components=40)),
    ('clf', HistGradientBoostingClassifier(learning_rate=0.05, random_state=42))
])

# obtain cv score
print('Validating language detection model...')
cv_results2 = cross_validate(oe_model, df2['text'], df2['y'], cv = cv, scoring = 'accuracy', verbose=0, n_jobs=-1)
m2, s2 = np.mean(cv_results2['test_score']), np.std(cv_results2['test_score'])
print(f'CV accuracy: {m2:.4f} ({s2:.4f})')

# train final models
print('Fitting final offensive english classification model...')
oe_model = oe_model.fit(df2['text'], df2['y'])

Validating language detection model...
CV accuracy: 0.6934 (0.0029)
Fitting final offensive english classification model...


In [10]:
# save final models
out_path = 'app/model/'
os.makedirs(out_path, exist_ok=True)

# language detection
with open(out_path + 'ld_model.pkl', 'wb') as f:
    pickle.dump(ld_model, f)

# offensive english
with open(out_path + 'oe_model.pkl', 'wb') as f:
    pickle.dump(oe_model, f)