In [1]:
import datetime

# Wall-clock timestamp captured once when the notebook starts; naive local time
# (no timezone attached), reused below to print the run date.
now = datetime.datetime.now(tz=None)

# Parallelization

## Summer School JGU Mainz — Advanced Methods in Behavioral Economics, 2021

### Carina I. Hausladen

In [2]:
# Show the date of this run as YYYY-MM-DD.
print(f"{now:%Y-%m-%d}")

2021-09-25


# Goal
In this tutorial, we search for the best combination of data set variant, target variable (y), and input column (X) by running every combination as a separate job.

In [3]:
import itertools
import multiprocessing as mp
import pickle
import warnings
warnings.simplefilter("ignore")

import gensim
import pandas as pd
import spacy
from gensim.models.word2vec import Word2Vec
from imblearn.over_sampling import RandomOverSampler

from utils.setup import prepare_X_y
from utils.prepare_feat import prepare_feat

In [4]:
# --- Notebook-wide configuration ---------------------------------------------
warnings.simplefilter('ignore')               # keep library warnings out of the cell outputs
pd.set_option('display.max_colwidth', 1000)   # show up to 1000 characters per text cell

# Stop early when gensim reports no fast code path
# (presumably FAST_VERSION == -1 means the compiled extension is missing -- TODO confirm).
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise."

workers = mp.cpu_count()                      # CPU count reported by the operating system
ros = RandomOverSampler(random_state=42)      # seeded so the oversampling is repeatable

# German spaCy pipeline; install it with:
#   .venv/bin/python -m spacy download de_core_news_sm
nlp = spacy.load("de_core_news_sm")
stop_words = spacy.lang.de.stop_words.STOP_WORDS

# Read and prepare df


In [5]:
# Read both chat data sets (original and spell-checked), then run each through
# prepare_X_y with the stated hours as the dependent variable.
# The list is built first, so both CSV files are read before either is prepared.
df, df_spllchckd = (
    prepare_X_y(raw_frame, dv="player.hours_stated")
    for raw_frame in [
        pd.read_csv('data/chat_hours_simulated.csv'),
        pd.read_csv('data/chat_spll_hours_simulated.csv'),
    ]
)

# Define loop vars
🤓: Uncomment the variations you are interested in and run the loop!

In [6]:
# Each dict maps a short label to the object it stands for. Every combination of
# one entry per dict becomes one job. Uncomment further entries to widen the search.

# Data set variants.
df_vars = {
    "duplicated": df,
    # NOTE(review): "deduplicated" points at the same `df` as "duplicated" -- confirm this is intended.
    # "deduplicated": df,
    # "spell_checked": df_spllchckd,
}

# Target (y) variants; the values are presumably column names created by prepare_X_y -- TODO confirm.
y_vars = {
    "<10": "honest10",
    # "<30": "honest30",
    # "<mean": "honestmean",
}

# Input (X) variants; the values are presumably names of chat text columns -- TODO confirm.
X_vars = {
    "chat_subject": "Chat_subject",
    # "chat_group": "Chat_group_all",
    # "chat_selected": "Chat_sel",
}

# Create jobs for loop 

We use [itertools.product](https://docs.python.org/3/library/itertools.html) to create the Cartesian product of input iterables.


In [7]:
# Start with an empty result list; it is filled when the jobs are run.
results = []

# One job per (data set, target, input) combination. Each element of a job is a
# (label, value) pair taken from the corresponding dict.
jobs = list(
    itertools.product(
        df_vars.items(),
        y_vars.items(),
        X_vars.items(),
    )
)
print(len(jobs))

1


# Run Loop

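Note: `prepare_feat` is imported from `utils/prepare_feat.py` rather than defined in the notebook. This matters for multiprocessing and joblib: worker processes must be able to import the function by name, which can fail for functions defined interactively in a notebook.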

We use [multiprocessing](https://docs.python.org/3/library/multiprocessing.html) to fully leverage multiple processors on my machine. 

🤓: Look into utils/prepare_feat.py. Which combination of vectorizer and classifier is used?

In [8]:
# Run every job in a worker pool and collect the results in job order.
#
# `pool.apply` blocks until its single call has finished, so a list comprehension
# over it runs the jobs one after another. `starmap` hands all jobs to the pool at
# once and still returns the results in the order of `jobs`.
#
# The pool is sized from the detected CPU count (`workers`) rather than a fixed 10,
# capped at the number of jobs, with a floor of 1 so an empty job list does not
# raise. The `with` block shuts the pool down even if a job fails.
#
# NOTE(review): the first argument is always the global `df`, while `d` already
# carries a (label, DataFrame) pair -- confirm which frame prepare_feat uses.
with mp.Pool(processes=max(1, min(workers, len(jobs)))) as pool:
    results = pool.starmap(
        prepare_feat,
        [(df, d, y, x) for d, y, x in jobs],  # parallel processing
    )

  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,


duplicated <10 chat_subject
finished DocPreprocess
finished split
start gridsearch
finished gridsearch
F1: 0.388 | Pr: 0.292 | Re: 0.700 | AUC: 0.559 | Accuracy: 0.523 

15.648609638214111


# Save Results
🤓: Print the best hyperparameters of your model.

In [9]:
# Write the collected job results to disk as a single pickle file.
results_path = 'data/interim/df_y_x_over.pickle'
with open(results_path, mode='wb') as pickle_file:
    pickle.dump(results, pickle_file)