In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from xgboost import XGBClassifier
from spacy.lang.es import Spanish
from spacy.lang.pt import Portuguese
from sklearn.metrics import balanced_accuracy_score
from spacy.tokenizer import Tokenizer
from sklearn.svm import LinearSVC
import dask.dataframe as dd
import pyarrow
from multiprocessing import  Pool
import numpy as np
import wordbatch

In [2]:
# Load Dataset
data_raw = pd.read_csv('../data/train.csv')

### Basic statistics

In [None]:
data_raw.dtypes

In [None]:
data_raw.describe()

In [None]:
data_raw.sample(5)

In [3]:
data_raw.label_quality.value_counts(normalize=True)

unreliable    0.940788
reliable      0.059212
Name: label_quality, dtype: float64

In [None]:
data_raw.language.value_counts()

In [None]:
data_raw.category.value_counts()

## Subsampling

In [5]:
category_pct = data_raw.category.value_counts(normalize=True)

In [9]:
category_pct

PANTS                                   0.001799
COFFEE_MAKERS                           0.001755
BABY_CAR_SEATS                          0.001708
MUSICAL_KEYBOARDS                       0.001661
MATTRESSES                              0.001648
                                          ...   
CONSTRUCTION_LIME_BAGS                  0.000010
COLD_FOOD_AND_DRINK_VENDING_MACHINES    0.000008
PAINTBALL_SMOKE_GRENADES                0.000008
COMMERCIAL_POPCORN_MACHINES             0.000007
HAMBURGER_FORMERS                       0.000005
Name: category, Length: 1588, dtype: float64

In [6]:
data_reliable = data_raw[data_raw["label_quality"]=='reliable']

In [18]:
reliable_category_pct = data_reliable.category.value_counts(normalize=True).round(6)

In [25]:
len(category_pct) - len(reliable_category_pct)

193

In [24]:
len(set(category_pct.index) - set(reliable_category_pct.index) )

193

In [36]:
base_data_dir = '../data/'
normalized_language_files = {'spanish':{'train': base_data_dir + 'train_spanish_norm.csv',
                                        'validation': base_data_dir + 'val_spanish_norm.csv',
                                        'test': base_data_dir + 'test_spanish_norm.csv'}, 
                             'portuguese': {'train': base_data_dir + 'train_portuguese_norm.csv',
                                            'validation': base_data_dir + 'val_portuguese_norm.csv',
                                            'test': base_data_dir + 'test_portuguese_norm.csv'},
                             'mapping':{'spanish': base_data_dir + 'language_mapping_spanish.csv',
                                         'portuguese': base_data_dir + 'language_mapping_portuguese.csv'}
                            }                     

In [38]:
normalized_language_files["spanish"]["train"]

'../data/train_spanish_norm.csv'

### Preprocessing

#### Tasks:
    1. Lowercase all words
    2. Tokenize
    3. Remove stop words
    4. Remove special characters

In [None]:
nlp_es = Spanish()
nlp_pt = Portuguese()

In [7]:
data = data_raw[0:100000].copy()
#data = data_raw.copy()

In [None]:
data.sample(20)

In [None]:
# 1. Lowercase
data['title'] = data['title'].str.lower()

In [None]:
# 2. Tokenize
# 3. Remove Stopwords & Punctuation

In [8]:
mask_spanish    = data["language"] == 'spanish'
mask_portuguese = data["language"] == 'portuguese'

In [None]:
data.loc[mask_spanish, "tokens"] = data["title"].apply(lambda x: ' '.join([tok.text for tok in nlp_es.tokenizer(x) if not (tok.is_punct or tok.is_stop)]))
data.loc[mask_portuguese, "tokens"] = data["title"].apply(lambda x: ' '.join([tok.text for tok in nlp_pt.tokenizer(x) if not (tok.is_punct or tok.is_stop)]))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data["tokens"], data["category"], test_size=0.10, random_state=42)

In [None]:
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)
X_train_tfidf.shape

### Classifiers

In [None]:
clf = DecisionTreeClassifier()
clf.fit(X_train_tfidf, y_train)

In [None]:
yTrainPredict = clf.predict(X_train_tfidf)
yPrediction = clf.predict(X_test_tfidf)
print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_train, yTrainPredict))
print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_test, yPrediction))

In [None]:
clf_svm = LinearSVC()

In [None]:
clf_svm.fit(X_train_tfidf, y_train)

In [None]:
#TrainPredict = clf_svm.predict(X_train_tfidf)
yPrediction = clf_svm.predict(X_test_tfidf)
#print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_train, yTrainPredict))
print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_test, yPrediction))

In [None]:
from datetime import datetime
# datetime object containing current date and time
now = datetime.now()
 
print("now =", now)
# dd/mm/YY H:M:S
dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
print("date and time =", dt_string)	

In [None]:
from sklearn.externals import joblib
# Output a pickle file for the model
joblib.dump(clf_svm, 'svm_model1.pkl') 


## DASK

In [None]:
filename = '../data/train.csv'
df = dd.read_csv(filename, dtype='str')
df.to_parquet('../data/train.parquet', engine='pyarrow')

In [None]:
df = dd.read_parquet('../data/train.parquet', engine='pyarrow')
df

In [None]:
mask_spanish    = df["language"] == 'spanish'
mask_portuguese = df["language"] == 'portuguese'

In [None]:
df['title'] = df['title'].str.lower()

In [None]:
df.compute()

In [None]:
df[mask_spanish]["tokens"] = df["title"].apply(lambda x: ' '.join([tok.text for tok in nlp_es.tokenizer(x) if not (tok.is_punct or tok.is_stop)]))
df[mask_portuguese]["tokens"] = df["title"].apply(lambda x: ' '.join([tok.text for tok in nlp_pt.tokenizer(x) if not (tok.is_punct or tok.is_stop)]))

In [None]:
df.compute()

In [None]:
df.head(5)

In [None]:
df.index.values

In [None]:
!time python /home/franco_camporeale/mlchallenge/preprocess.py

## Parallelize

In [7]:
data = pd.read_csv('../data/train.csv')

In [8]:
nlp_es = Spanish()
nlp_pt = Portuguese()

In [9]:
def preprocess(df):
    mask_spanish    = df["language"] == 'spanish'
    mask_portuguese = df["language"] == 'portuguese'
    df['title'] = df['title'].str.lower()
    df.loc[mask_spanish, "tokens"] = df["title"].apply(lambda x: ' '.join([tok.text for tok in nlp_es.tokenizer(x) if 
                                                                          tok.is_alpha and not (tok.is_digit or tok.is_stop or len(tok.text) == 1)]))
    df.loc[mask_portuguese, "tokens"] = df["title"].apply(lambda x: ' '.join([tok.text for tok in nlp_pt.tokenizer(x) if
                                                                             tok.is_alpha and not (tok.is_digit or tok.is_stop or len(tok.text) == 1)]))
    return df

In [10]:
def parallelize_dataframe(df, func, n_cores=8):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [11]:
%time train = parallelize_dataframe(data, preprocess)

CPU times: user 35.2 s, sys: 17.3 s, total: 52.5 s
Wall time: 23min 39s


In [12]:
train.to_csv('../data/train_prep1.csv')

## Feature Engineering

Vamos a usar wordbatch para paralelizar:

https://medium.com/@d.canivel/wordbatch-a-parallel-text-feature-extraction-for-machine-learning-eb3696f40996

Y vamos a usar hashvectorizer:

https://www.researchgate.net/post/What_is_a_good_way_to_perform_topic_modeling_on_short_text


In [2]:
from wordbatch.pipelines import WordBatch
from wordbatch.extractors import WordHash

In [4]:
data = pd.read_csv('../data/train_prep1.csv')
data.drop(["title"],axis=1,inplace=True)
data.dropna(inplace=True)
data.to_csv('../train_prep.csv')

In [4]:
data.dropna(inplace=True)
X_train, X_test, y_train, y_test = train_test_split(data["tokens"], data["category"], test_size=0.10, random_state=42)

In [5]:
data.shape

(19999755, 6)

In [6]:
vect = CountVectorizer(ngram_range=(1,2),max_features=2298583)
%time X_train_vect = vect.fit_transform(X_train)
%time X_test_vect = vect.transform(X_test)
X_train_vect.shape

CPU times: user 11min 20s, sys: 15.8 s, total: 11min 36s
Wall time: 11min 34s
CPU times: user 51.3 s, sys: 205 ms, total: 51.5 s
Wall time: 51.5 s


(17999779, 2298583)

In [None]:
clf_svm = LinearSVC()
%time clf_svm.fit(X_train_vect, y_train)


In [None]:
#TrainPredict = clf_svm.predict(X_train_tfidf)
%time yPrediction = clf_svm.predict(X_test_vect)
#print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_train, yTrainPredict))
print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_test, yPrediction))

In [5]:
del data

In [6]:
!python train_svc.py

^C
