In [1]:
# universally modules
import sys
sys.path.append("../src")
import numpy as np
import pandas as pd
from tqdm import tqdm
import dask.dataframe as dd
from dask.distributed import Client
client = Client()

# preprocessing and transformation modules
import fasttext
import Preprocessing
from Features import buildFeatures
from Modelling import StackingModelling
from ModelSelection import ModelSelection, process_case
from Transformation import StackedTransformation, transformation

# Scikit-Learn
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer

# model algorithm
from sklearn.svm import LinearSVC, LinearSVR
from xgboost import XGBRegressor, XGBClassifier
from sklearn.linear_model import SGDClassifier, SGDRegressor

# evaluation modules
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report



### Parameters

In [2]:
# to speed up the process choose a sample size to randomly draw a sample of the whole daataset
sample_mode = False
sample_size = None 

# remove all text that contain less than n chars
min_chars_per_text = 50

# which features will be used for the TF-IDF transformation
text_features = "text_preprocessed"

# General Preprocessing

This part is independent from the cases

### Initialization 

In [3]:
df = pd.read_csv("../resource/data/blogtext.csv")

# draw random sample for faster processing:
if sample_mode == True:
    df = df.sample(sample_size)

### Filtering

In [4]:
# filter for a mininmal number of letters in a tweet:
df = df[df["text"].str.count(r"[a-zA-Z]") >= min_chars_per_text]
df = df.reset_index(drop=True)

# transform to dask
ds = dd.from_pandas(df["text"], npartitions = 8)

### Feature Engineering

In [5]:
# append the data
features = list(ds.apply(buildFeatures).compute())

# merge the features with the original dataset
df_preprocessed = df.merge(pd.DataFrame(features), left_index=True, right_index=True)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('text', 'object'))



### Text Preprocessing

In [6]:
# use the preprocessing  module
preprocessing = Preprocessing.Preprocessing()

df_preprocessed["text_preprocessed"] = ds.apply(preprocessing.ProcessOne, meta=("text", str)).compute()

  ("('from_pandas-5260b287f7aa7ba30ef88301e891ad74', ... 88301e891ad74')
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good


In [7]:
# predict the main language
model = fasttext.load_model('../src/data/lid.176.ftz')
df_preprocessed["main_language"] = [model.predict(text)[0][0].split("__")[-1] for text in df_preprocessed["text_preprocessed"]]

# drop unnecassary features
df_filtered = df_preprocessed[(df_preprocessed["main_language"] == "en")]\
                .drop(["id","text","date","main_language"], axis= 1)



In [8]:
# store df as pickle
df_filtered.to_pickle("../resource/df_full_preprocessed.pkl")