In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from multiprocessing import  Pool
import numpy as np
import fasttext
import csv
import unicodedata
from collections import defaultdict
from joblib import Parallel, delayed
%load_ext autoreload
%autoreload 2
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import utils
from utils import models_for_predict, normalized_files


In [None]:
## Resolviendo falta de language mapping

In [16]:
# train.csv is read with its own header row; val_full_norm.csv has no header,
# so every line lands in a single 'title' column.
train = pd.read_csv("../data/train.csv")
val_full_norm = pd.read_csv(
    "../data/val_full_norm.csv",
    header=None,
    names=['title'],
)

In [17]:
# Drop the first whitespace-separated token of every line and re-join the rest
# with single spaces (presumably the leading fastText label -- verify).
val_full_norm['title'] = val_full_norm['title'].map(
    lambda line: ' '.join(line.split()[1:])
)

In [18]:
# Hold out 7.5% of train.csv, stratified by category. Here X_* carry the
# 'language' column and y_* the 'category' column.
# NOTE(review): the `full_results` cell further down reads `y_val` from THIS
# split, while a later train_test_split cell rebinds all four names -- running
# cells out of order pairs the wrong labels.
X_train, X_val, y_train, y_val = train_test_split(
    train["language"],
    train["category"],
    test_size=0.075,
    random_state=42,
    stratify=train["category"],
)

In [None]:
# Eyeball 50 random rows: validation title next to the held-out language at the
# same position (X_val's own index is discarded via .values).
pd.concat(
    [val_full_norm["title"], pd.Series(X_val.values)],
    axis=1,
    ignore_index=True,
).sample(50)

In [19]:
# Persist the held-out languages, one per line, without header or index.
X_val.to_csv(
    "../data/val_language_mapping.csv",
    header=False,
    index=False,
)

In [None]:
## Fin resolviendo falta de language mapping

## Evaluando resultados en validation set

In [11]:
def run_model_on_val(model_file):
    """Predict a category for every validation title with one base classifier.

    Parameters
    ----------
    model_file : str or dict
        Path of a single fastText model, or a dict with the keys ``"spanish"``
        and ``"portuguese"`` mapping to the paths of per-language models.

    Returns
    -------
    pandas.Series
        Predicted labels with their first 9 characters removed (presumably the
        ``__label__`` prefix). Named ``"bilingual"`` for a per-language pair
        and ``model_file[10:]`` for a single model.

    Raises
    ------
    ValueError
        If the language mapping holds a value other than ``"spanish"`` or
        ``"portuguese"`` (a missing value included). Previously such a row
        either raised ``UnboundLocalError`` or silently reused the previous
        row's prediction.
    """
    print("Loading model file ", model_file, '...')
    if isinstance(model_file, dict):
        # Here we predict combining models for each language
        # NOTE(review): this branch reads the titles from a hard-coded path and
        # keeps the first token of each line, unlike the single-model branch
        # below -- confirm that is intended.
        val_data = pd.read_csv('../data/val_full_norm.csv',header=None,names=['title'])
        language_mapping = pd.read_csv('../data/val_language_mapping.csv',names=["language"])
        # Both frames carry the default row index, so rows are paired by position.
        val_language = pd.concat([val_data,language_mapping['language']],axis=1)

        # Fail fast (before loading any model) on languages we cannot route.
        known = val_language["language"].isin(["spanish", "portuguese"])
        if not known.all():
            unexpected = val_language.loc[~known, "language"].unique().tolist()
            raise ValueError(
                "Unsupported language value(s) in validation mapping: %r" % (unexpected,)
            )

        models_by_language = {
            "spanish": fasttext.load_model(model_file["spanish"]),
            "portuguese": fasttext.load_model(model_file["portuguese"]),
        }
        print("Running predict on validation set...")
        # predict(text) on a single string returns (labels, probabilities);
        # [0][0] is the top label and [9:] drops its first 9 characters.
        predictions = [
            models_by_language[language].predict(title)[0][0][9:]
            for title, language in zip(val_language["title"], val_language["language"])
        ]
        print("Predict finished for model bilingual")
        result = pd.Series(predictions, name="bilingual")
    else:
        # Only this branch uses the normalized validation file, so it is read here
        # instead of unconditionally at the top of the function.
        val_data = pd.read_csv(normalized_files["validation"],header=None,names=['title'])
        # Drop the first whitespace-separated token of every line.
        val_data['title'] = val_data['title'].apply(lambda x: ' '.join(x.split()[1:]))

        model = fasttext.load_model(model_file)
        print("Running predict on validation set...")
        # Batch predict: element 0 of the result holds one label list per title.
        predictions = model.predict(val_data["title"].values.tolist())
        print("Predict finished for model ", model_file)
        # model_file[10:] drops the first 10 characters of the path; kept as-is so
        # the resulting column names do not change.
        result = pd.Series([x[0][9:] for x in predictions[0]], name=model_file[10:])
    return result

def count_votes(results_df):
    """Majority-vote the per-model predictions of every row.

    The first column of ``results_df`` is ignored; each remaining column is
    one vote.

    Returns a DataFrame with the columns ``id`` (the row's index label) and
    ``category`` (the first entry of that row's ``value_counts()``, i.e. its
    most frequent value).
    """
    ballots = results_df.iloc[:, 1:]
    winners = [ballot.value_counts().index[0] for _, ballot in ballots.iterrows()]
    return pd.DataFrame.from_dict({"id": list(ballots.index), "category": winners})

def parallel_val_predict(n_jobs=8):
    """Run every model in ``models_for_predict`` on the validation set in parallel.

    Parameters
    ----------
    n_jobs : int, default 8
        Number of joblib workers. The default matches the previously
        hard-coded value, so existing zero-argument calls behave the same.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``models_for_predict``, in that dict's order,
        built by concatenating the per-model result Series side by side.
    """
    results = Parallel(n_jobs=n_jobs)(
        delayed(run_model_on_val)(model) for model in models_for_predict.values()
    )
    return pd.concat(results, axis=1)



In [12]:
%time base_classifiers_results = parallel_val_predict()

CPU times: user 2.33 s, sys: 1.2 s, total: 3.54 s
Wall time: 27min 56s


In [13]:
base_classifiers_results

Unnamed: 0,co_camporeale/models/model_full_300.bin,co_camporeale/models/model_full_100.bin,odels/model_reliable_2gram,odels/model_reliable_3gram,bilingual,bilingual.1,bilingual.2
0,CAR_SCANNERS,CAR_SCANNERS,CAR_SCANNERS,CAR_SCANNERS,CAR_SCANNERS,CAR_SCANNERS,CAR_SCANNERS
1,NOTEBOOK_CASES,NOTEBOOK_CASES,COMICS,NOTEBOOK_CASES,CELLPHONE_COVERS,CELLPHONE_COVERS,CELLPHONE_COVERS
2,KEY_RACKS,KEY_RACKS,KEYCHAINS,KEY_RACKS,KEY_RACKS,KEY_RACKS,KEY_RACKS
3,AUTOMOTIVE_EMBLEMS,AUTOMOTIVE_EMBLEMS,AUTOMOTIVE_TRUNK_LIDS,AUTOMOTIVE_TRUNK_LIDS,AUTOMOTIVE_EMBLEMS,AUTOMOTIVE_EMBLEMS,AUTOMOTIVE_EMBLEMS
4,WIRELESS_FM_TRANSMITTERS,WIRELESS_FM_TRANSMITTERS,WIRELESS_FM_TRANSMITTERS,WIRELESS_FM_TRANSMITTERS,WIRELESS_FM_TRANSMITTERS,WIRELESS_FM_TRANSMITTERS,WIRELESS_FM_TRANSMITTERS
...,...,...,...,...,...,...,...
1499995,AUTOMOTIVE_EMBLEMS,BEDLINERS,BUMPER_IMPACT_ABSORBERS,BUMPER_IMPACT_ABSORBERS,AUTOMOTIVE_EMBLEMS,AUTOMOTIVE_EMBLEMS,AUTOMOTIVE_EMBLEMS
1499996,SOUND_CONSOLES_AND_DJ_MIXERS,SOUND_CONSOLES_AND_DJ_MIXERS,SOUND_CONSOLES_AND_DJ_MIXERS,SOUND_CONSOLES_AND_DJ_MIXERS,SOUND_CONSOLES_AND_DJ_MIXERS,SOUND_CONSOLES_AND_DJ_MIXERS,SOUND_CONSOLES_AND_DJ_MIXERS
1499997,TOOTHPASTES,TOOTHPASTES,TOOTHPASTES,TOOTHPASTES,TOOTHPASTES,TOOTHPASTES,TOOTHPASTES
1499998,EYESHADOWS,EYESHADOWS,EYESHADOWS,EYESHADOWS,EYESHADOWS,EYESHADOWS,EYESHADOWS


## Entrenando

In [14]:
# Replace the auto-generated column names with short, unique model names
# (assigned positionally, one per base classifier).
base_classifiers_results.columns = [
    "model_full_300",
    "model_full_100",
    "model_reliable_2gram",
    "model_reliable_3gram",
    "bilingual1",
    "bilingual2",
    "bilingual3",
]

In [19]:
# Append the true categories as an extra column. The Series is unnamed, so the
# column ends up labelled 0.
# NOTE(review): `y_val` must still be the category hold-out from the first
# train_test_split cell when this runs.
full_results = pd.concat(
    [base_classifiers_results, pd.Series(y_val.values)],
    axis=1,
)

In [21]:
# Split the base-classifier predictions 75/25 (stratified on the true category
# in column 0) to train and evaluate the stacked model.
# NOTE(review): this rebinds X_train / X_val / y_train / y_val from the earlier
# language split.
X_train, X_val, y_train, y_val = train_test_split(
    full_results[[
        'model_full_300',
        'model_full_100',
        'model_reliable_2gram',
        'model_reliable_3gram',
        'bilingual1',
        'bilingual2',
        'bilingual3',
    ]],
    full_results[0],
    test_size=0.25,
    random_state=42,
    stratify=full_results[0],
)

In [None]:
y_train

In [106]:
# Join five of the seven base predictions into one space-separated string per
# row; the two model_reliable_* columns are left out.
X_train["ft_predict"] = (
    X_train["model_full_300"]
    + ' ' + X_train["model_full_100"]
    + ' ' + X_train["bilingual1"]
    + ' ' + X_train["bilingual2"]
    + ' ' + X_train["bilingual3"]
)

In [107]:
# Put the joined predictions next to the true category (column 0).
ft_ensemble_train = pd.concat(
    [X_train["ft_predict"], y_train],
    axis=1,
)

In [108]:
# Prefix every true category with '__label__'.
# NOTE(review): not idempotent -- re-running this cell prefixes the column again.
ft_ensemble_train[0] = ft_ensemble_train[0].map(
    lambda category: '__label__' + category
)

In [None]:
ft_ensemble_train

In [110]:
# Training line layout: "<prefixed label> <joined base predictions>".
ft_ensemble_train["title"] = (
    ft_ensemble_train[0] + ' ' + ft_ensemble_train["ft_predict"]
)

In [111]:
# Write the training lines, one per row, without index or header.
ft_ensemble_train["title"].to_csv(
    "../data/ensemble/val_train.csv",
    index=False,
    header=False,
)

In [112]:
# Same five-column join as for X_train, applied to the hold-out part.
X_val["ft_predict"] = (
    X_val["model_full_300"]
    + ' ' + X_val["model_full_100"]
    + ' ' + X_val["bilingual1"]
    + ' ' + X_val["bilingual2"]
    + ' ' + X_val["bilingual3"]
)

In [113]:
# Put the joined hold-out predictions next to the true category (column 0).
ft_ensemble_test = pd.concat(
    [X_val["ft_predict"], y_val],
    axis=1,
)

In [114]:
# Prefix every true category with '__label__'.
# NOTE(review): not idempotent -- re-running this cell prefixes the column again.
ft_ensemble_test[0] = ft_ensemble_test[0].map(
    lambda category: '__label__' + category
)

In [115]:
# Line layout: "<prefixed label> <joined base predictions>".
ft_ensemble_test["title"] = (
    ft_ensemble_test[0] + ' ' + ft_ensemble_test["ft_predict"]
)

In [116]:
# Write the hold-out lines, one per row, without index or header.
ft_ensemble_test["title"].to_csv(
    "../data/ensemble/val_test.csv",
    index=False,
    header=False,
)

In [None]:
ft_ensemble_test

In [None]:
%time model = fasttext.train_supervised(input=ft_ensemble_train["title"].values.tolist(), lr=0.5, wordNgrams=3, thread=8)

In [51]:
model = fasttext.load_model("../models/ensemble/stacked_ft1.bin")




In [113]:
test_data = pd.read_csv('../data/test_full_norm.csv',names=['tokens'])

In [114]:
%time predictions = model.predict(test_data["tokens"].values.tolist())

CPU times: user 1min 8s, sys: 458 ms, total: 1min 9s
Wall time: 1min 17s


In [115]:
# predictions[0] holds one label list per input line; keep the top label and
# drop its first 9 characters (presumably the '__label__' prefix).
submission = pd.Series(
    [labels[0][9:] for labels in predictions[0]]
)

In [None]:
submission

In [None]:
# Two-column CSV: the Series index as "id", the prediction as "category".
submission.to_csv(
    "./submissions/submission_test.csv",
    header=["category"],
    index_label="id",
)

In [None]:
# NOTE(review): `clf` and `X_val_vect` are not defined in any cell visible in
# this notebook; this cell relies on kernel state from elsewhere.
yPrediction = clf.predict(X_val_vect)
print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_val, yPrediction))

In [26]:
# NOTE(review): same statements as the cell directly above; `clf` and
# `X_val_vect` are not defined in any cell visible in this notebook.
yPrediction = clf.predict(X_val_vect)
print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_val, yPrediction))

Balanced Accuracy Score: 0.50


In [None]:
%time yTrainPredict = clf.predict(X_train_vect)
%time yPrediction = clf.predict(X_test_vect)


In [None]:
# Both lines print the same label text: the first scores the training
# predictions, the second scores `yPrediction` against y_val.
# NOTE(review): the preceding cell computes `yPrediction` from `X_test_vect`,
# yet it is scored against `y_val` here -- confirm these refer to the same rows.
print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_train, yTrainPredict))
print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_val, yPrediction))

## Predicción sobre el test set

In [38]:
def run_model_test(model):
    """Predict a category for every test title with one base classifier.

    Parameters
    ----------
    model : str or dict
        Path of a single fastText model, or a dict with the keys ``"spanish"``
        and ``"portuguese"`` mapping to the paths of per-language models.

    Returns
    -------
    pandas.Series
        Unnamed Series of predicted labels with their first 9 characters
        removed (presumably the ``__label__`` prefix).

    Raises
    ------
    ValueError
        If the language mapping holds a value other than ``"spanish"`` or
        ``"portuguese"`` (a missing value included). Previously such a row
        either raised ``UnboundLocalError`` or silently reused the previous
        row's prediction.
    """
    test_data = pd.read_csv('../data/test_full_norm.csv',header=None,names=['title'])
    # Collapse every run of whitespace into a single space.
    test_data['title'] = test_data['title'].apply(lambda x: ' '.join(x.split()))
    # The former debug dump of the first 10 rows to 'rv.csv' was removed: every
    # parallel worker wrote that same file and nothing here reads it back.
    print("Loading model file ", model, '...')
    if isinstance(model, dict):
        # Here we predict combining models for each language
        language_mapping = pd.read_csv('../data/language_mapping_test.csv',names=["language"])
        # Both frames carry the default row index, so rows are paired by position.
        test_language = pd.concat([test_data,language_mapping['language']],axis=1)

        # Fail fast (before loading any model) on languages we cannot route.
        known = test_language["language"].isin(["spanish", "portuguese"])
        if not known.all():
            unexpected = test_language.loc[~known, "language"].unique().tolist()
            raise ValueError(
                "Unsupported language value(s) in test mapping: %r" % (unexpected,)
            )

        models_by_language = {
            "spanish": fasttext.load_model(model["spanish"]),
            "portuguese": fasttext.load_model(model["portuguese"]),
        }
        print("Running predict on test set...")
        # predict(text) on a single string returns (labels, probabilities);
        # [0][0] is the top label and [9:] drops its first 9 characters.
        predictions = [
            models_by_language[language].predict(title)[0][0][9:]
            for title, language in zip(test_language["title"], test_language["language"])
        ]
        print("Predict finished for model ", model)
        return pd.Series(predictions)

    # Keep the path in `model` so the final log line names the file instead of
    # printing the loaded model object.
    loaded_model = fasttext.load_model(model)
    print("Running predict on test set...")
    # Batch predict: element 0 of the result holds one label list per title.
    predictions = loaded_model.predict(test_data["title"].values.tolist())
    print("Predict finished for model ", model)
    return pd.Series([x[0][9:] for x in predictions[0]])

            
def parallel_models_get_test_results(model_files, n_cores=8):
    """Run ``run_model_test`` once per value of ``model_files`` on ``n_cores`` joblib workers.

    Only the dict's values are used. Returns what ``Parallel`` produces: one
    result per model, in the dict's iteration order.
    """
    jobs = (delayed(run_model_test)(spec) for spec in model_files.values())
    return Parallel(n_jobs=n_cores)(jobs)

In [39]:
# Model registry for the test-set run: name -> fastText model path, or a
# {"spanish": path, "portuguese": path} pair for per-language models.
# NOTE(review): this rebinds the `models_for_predict` name imported from `utils`
# in the first cell.
# NOTE(review): "model_portuguese100.bin" has no underscore before "100",
# unlike its sibling file names -- confirm the file name on disk.
models_for_predict = {
    "model_full_300": "/home/franco_camporeale/models/model_full_300.bin",
    "model_full_100": "/home/franco_camporeale/models/model_full_100.bin",
    "model_reliable_2gram": "/backups/models/model_reliable_2gram",
    "model_reliable_3gram": "/backups/models/model_reliable_3gram",
    "model_bilingual_3gram": {
        "spanish": "/backups/models/model_spanish_3gram",
        "portuguese": "/backups/models/model_portuguese_3gram",
    },
    "model_bilingual_2gram_100": {
        "spanish": "/home/franco_camporeale/models/model_spanish_100.bin",
        "portuguese": "/home/franco_camporeale/models/model_portuguese100.bin",
    },
    "model_bilingual_2gram_300": {
        "spanish": "/home/franco_camporeale/models/model_spanish_300.bin",
        "portuguese": "/home/franco_camporeale/models/model_portuguese_300.bin",
    },
}

%time results = parallel_models_get_test_results(models_for_predict, n_cores=8)

CPU times: user 351 ms, sys: 1.56 s, total: 1.91 s
Wall time: 5min 42s


In [41]:
# One column per model; the unnamed result Series get the integer column
# labels 0..6.
base_classifiers_results = pd.concat(results, axis=1)

In [121]:
# Join columns 0, 1, 4, 5 and 6 into one space-separated string per row;
# columns 2 and 3 are left out (presumably the two model_reliable_* models,
# mirroring the features used for the stacked model -- verify).
base_classifiers_results["title"] = (
    base_classifiers_results[0]
    + ' ' + base_classifiers_results[1]
    + ' ' + base_classifiers_results[4]
    + ' ' + base_classifiers_results[5]
    + ' ' + base_classifiers_results[6]
)

In [122]:
base_classifiers_results

Unnamed: 0,0,1,2,3,4,5,6,title
0,DIAPER_BAGS,DIAPER_BAGS,DIAPER_BAGS,DIAPER_BAGS,DIAPER_BAGS,DIAPER_BAGS,DIAPER_BAGS,DIAPER_BAGS DIAPER_BAGS DIAPER_BAGS DIAPER_BAG...
1,BABY_GROOMING_KITS,BABY_CHANGING_PADS,BABY_CHANGING_PADS,BABY_CHANGING_PADS,BABY_CHANGING_PADS,BABY_CHANGING_PADS,BABY_CHANGING_PADS,BABY_GROOMING_KITS BABY_CHANGING_PADS BABY_CHA...
2,ENGINE_COOLING_FAN_MOTORS,ENGINE_COOLING_FAN_MOTORS,ENGINE_COOLING_FAN_MOTORS,ENGINE_COOLING_FAN_MOTORS,ENGINE_COOLING_FAN_MOTORS,ENGINE_COOLING_FAN_MOTORS,ENGINE_COOLING_FAN_MOTORS,ENGINE_COOLING_FAN_MOTORS ENGINE_COOLING_FAN_M...
3,AUTOMOTIVE_SHOCK_ABSORBER_BUMP_STOPS,AUTOMOTIVE_SHOCK_ABSORBER_BUMP_STOPS,AUTOMOTIVE_SHOCK_ABSORBER_BUMP_STOPS,AUTOMOTIVE_SHOCK_ABSORBER_BUMP_STOPS,AUTOMOTIVE_SHOCK_ABSORBER_BUMP_STOPS,AUTOMOTIVE_SHOCK_ABSORBER_BUMP_STOPS,AUTOMOTIVE_SHOCK_ABSORBER_BUMP_STOPS,AUTOMOTIVE_SHOCK_ABSORBER_BUMP_STOPS AUTOMOTIV...
4,BABY_CAR_SEATS,BABY_CAR_SEATS,BABY_CAR_SEATS,BABY_CAR_SEATS,BABY_CAR_SEATS,BABY_CAR_SEATS,BABY_CAR_SEATS,BABY_CAR_SEATS BABY_CAR_SEATS BABY_CAR_SEATS B...
...,...,...,...,...,...,...,...,...
246950,VEHICLE_BRAKE_DISCS,VEHICLE_BRAKE_DISCS,VEHICLE_BRAKE_DISCS,VEHICLE_BRAKE_DISCS,VEHICLE_BRAKE_DISCS,VEHICLE_BRAKE_DISCS,VEHICLE_BRAKE_DISCS,VEHICLE_BRAKE_DISCS VEHICLE_BRAKE_DISCS VEHICL...
246951,WALKIE_TALKIES,WALKIE_TALKIES,WALKIE_TALKIES,WALKIE_TALKIES,WALKIE_TALKIES,WALKIE_TALKIES,WALKIE_TALKIES,WALKIE_TALKIES WALKIE_TALKIES WALKIE_TALKIES W...
246952,CALCULATORS,CALCULATORS,CALCULATORS,CALCULATORS,CALCULATORS,CALCULATORS,CALCULATORS,CALCULATORS CALCULATORS CALCULATORS CALCULATOR...
246953,DINING_TABLES,DINING_TABLES,DINING_SETS,DINING_SETS,DINING_SETS,DINING_TABLES,DINING_SETS,DINING_TABLES DINING_TABLES DINING_SETS DINING...


In [99]:
# Pass `header` explicitly: the default of Series.to_csv changed from False to
# True in pandas 1.0 (older versions warn about it). header=False keeps the
# header-less "index,title" layout on every pandas version.
base_classifiers_results["title"].to_csv("../data/test_predictions_for_ft.csv", header=False)

  """Entry point for launching an IPython kernel.


In [126]:
base_classifiers_results["title"].head(1).values

array(['DIAPER_BAGS DIAPER_BAGS DIAPER_BAGS DIAPER_BAGS DIAPER_BAGS'],
      dtype=object)

In [125]:
model = fasttext.load_model("../models/ensemble/stacked_ft3.bin")




In [None]:
%time predictions = model.predict(base_classifiers_results["title"].values.tolist())

In [None]:
# predictions[0] holds one label list per input line; keep the top label and
# drop its first 9 characters (presumably the '__label__' prefix).
submission = pd.Series(
    [labels[0][9:] for labels in predictions[0]]
)

In [None]:
submission

In [104]:
# Two-column CSV: the Series index as "id", the prediction as "category".
submission.to_csv(
    "./submissions/submission_ensemble_11.csv",
    header=["category"],
    index_label="id",
)

OVERFITTING ZARPADO

## Corrigiendo el modelo

In [117]:
# Reload both halves of the stacked-model data; neither file has a header row.
val1 = pd.read_csv(
    "../data/ensemble/val_train.csv",
    header=None,
    names=['title'],
)
val2 = pd.read_csv(
    "../data/ensemble/val_test.csv",
    header=None,
    names=['title'],
)

In [118]:
# Stack the two halves row-wise; each part keeps its own row index, so index
# labels repeat in the result.
validation_results = pd.concat(
    [val1, val2],
)

In [120]:
# Write the recombined lines, one per row, without index or header.
validation_results.to_csv(
    "../data/ensemble/val_full.csv",
    index=False,
    header=False,
)