In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from spacy.lang.es import Spanish
from spacy.lang.pt import Portuguese
from sklearn.metrics import balanced_accuracy_score
from spacy.tokenizer import Tokenizer
from sklearn.svm import LinearSVC
import dask.dataframe as dd
import pyarrow
from multiprocessing import  Pool
import numpy as np

In [None]:
# Load Dataset
data = pd.read_csv('../data/train.csv')

### Basic statistics

In [None]:
data_raw.dtypes

In [None]:
data_raw.describe()

In [None]:
data_raw.sample(5)

In [None]:
data_raw.label_quality.value_counts()

In [None]:
data_raw.language.value_counts()

In [None]:
data_raw.category.value_counts()

### Preprocessing

#### Tasks:
    1. Lowercase all words
    2. Tokenize
    3. Remove stop words
    4. Remove special characters

In [None]:
nlp_es = Spanish()
nlp_pt = Portuguese()

In [None]:
#data = data_raw[0:100000].copy()
data = data_raw.copy()

In [None]:
data.sample(20)

In [None]:
# 1. Lowercase
data['title'] = data['title'].str.lower()

In [None]:
# 2. Tokenize
# 3. Remove Stopwords & Punctuation

In [None]:
mask_spanish    = data["language"] == 'spanish'
mask_portuguese = data["language"] == 'portuguese'

In [None]:
data.loc[mask_spanish, "tokens"] = data["title"].apply(lambda x: ' '.join([tok.text for tok in nlp_es.tokenizer(x) if not (tok.is_punct or tok.is_stop)]))
data.loc[mask_portuguese, "tokens"] = data["title"].apply(lambda x: ' '.join([tok.text for tok in nlp_pt.tokenizer(x) if not (tok.is_punct or tok.is_stop)]))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data["tokens"], data["category"], test_size=0.10, random_state=42)

In [None]:
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)
X_train_tfidf.shape

### Classifiers

In [None]:
clf = DecisionTreeClassifier()
clf.fit(X_train_tfidf, y_train)

In [None]:
yTrainPredict = clf.predict(X_train_tfidf)
yPrediction = clf.predict(X_test_tfidf)
print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_train, yTrainPredict))
print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_test, yPrediction))

In [None]:
clf_svm = LinearSVC()

In [None]:
clf_svm.fit(X_train_tfidf, y_train)

In [None]:
#TrainPredict = clf_svm.predict(X_train_tfidf)
yPrediction = clf_svm.predict(X_test_tfidf)
#print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_train, yTrainPredict))
print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_test, yPrediction))

In [None]:
from datetime import datetime
# datetime object containing current date and time
now = datetime.now()
 
print("now =", now)
# dd/mm/YY H:M:S
dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
print("date and time =", dt_string)	

In [None]:
from sklearn.externals import joblib
# Output a pickle file for the model
joblib.dump(clf_svm, 'svm_model1.pkl') 


## DASK

In [None]:
filename = '../data/train.csv'
df = dd.read_csv(filename, dtype='str')
df.to_parquet('../data/train.parquet', engine='pyarrow')

In [None]:
df = dd.read_parquet('../data/train.parquet', engine='pyarrow')
df

In [None]:
mask_spanish    = df["language"] == 'spanish'
mask_portuguese = df["language"] == 'portuguese'

In [None]:
df['title'] = df['title'].str.lower()

In [None]:
df.compute()

In [None]:
df[mask_spanish]["tokens"] = df["title"].apply(lambda x: ' '.join([tok.text for tok in nlp_es.tokenizer(x) if not (tok.is_punct or tok.is_stop)]))
df[mask_portuguese]["tokens"] = df["title"].apply(lambda x: ' '.join([tok.text for tok in nlp_pt.tokenizer(x) if not (tok.is_punct or tok.is_stop)]))

In [None]:
df.compute()

In [None]:
df.head(5)

In [None]:
df.index.values

In [None]:
!time python /home/franco_camporeale/mlchallenge/preprocess.py

## Parallelize

In [6]:
from multiprocessing import  Pool
import numpy as np

In [7]:
data = pd.read_csv('../data/train.csv')

In [8]:
nlp_es = Spanish()
nlp_pt = Portuguese()

In [9]:
def preprocess(df):
    mask_spanish    = df["language"] == 'spanish'
    mask_portuguese = df["language"] == 'portuguese'
    df['title'] = df['title'].str.lower()
    df.loc[mask_spanish, "tokens"] = df["title"].apply(lambda x: ' '.join([tok.text for tok in nlp_es.tokenizer(x) if not (tok.is_punct or tok.is_stop)]))
    df.loc[mask_portuguese, "tokens"] = df["title"].apply(lambda x: ' '.join([tok.text for tok in nlp_pt.tokenizer(x) if not (tok.is_punct or tok.is_stop)]))
    return df

In [10]:
def parallelize_dataframe(df, func, n_cores=8):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [11]:
%time train = parallelize_dataframe(data, preprocess)

CPU times: user 35.2 s, sys: 17.3 s, total: 52.5 s
Wall time: 23min 39s


In [12]:
train.to_csv('../data/train_prep1.csv')

In [13]:
https://medium.com/@d.canivel/wordbatch-a-parallel-text-feature-extraction-for-machine-learning-eb3696f40996
http://physics.muni.cz/~vazny/04%20-%20Large%20Scale%20Text%20Classification%20for%20Sentiment%20Analysis.html

SyntaxError: invalid syntax (<ipython-input-13-7e63716e901f>, line 1)