In [None]:
import sys
import pickle
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
from dvclive import Live

# Create the DVCLive tracker; a later cell uses it to log the `len_counter` metric.
# NOTE(review): save_dvc_exp=True presumably asks dvclive to save a DVC
# experiment for this run — confirm against the installed dvclive version.
live = Live(save_dvc_exp=True)

In [None]:
import pathlib
 
# Show the kernel's working directory; the relative paths configured below
# (UTILS_DIR, DATA_DIR) are resolved against it.
print(pathlib.Path.cwd())

### PARAMS

In [None]:
# Directory appended to sys.path so the local `utils` package can be imported.
UTILS_DIR = "./"
# Directory the input CSV is read from and the pickled outputs are written to.
DATA_DIR = "../data/"
# File name of the input CSV inside DATA_DIR.
DATA_NAME = "train_data_cleaning.csv"

### Added by DataScientists

### Added by DataScientists

### Added by DataScientists

In [None]:
# Make the local `utils` package importable. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)

from utils.clean_text import clean_text
from utils.counter_word import counter_word
from utils.plot_target import plot_target

### Added by DataScientists

## DATA PREPROCESSING

In [None]:
# Load the input CSV (DATA_DIR followed by DATA_NAME) into a DataFrame.
train_df = pd.read_csv(f"{DATA_DIR}{DATA_NAME}")

In [None]:
# Print the frame summary: column names, dtypes, non-null counts and memory usage.
train_df.info()

In [None]:
# Number of missing values in each column of the freshly loaded frame.
train_df.isna().sum()

In [None]:
# Preview the first rows of the loaded data.
train_df.head()

In [None]:
# Summary statistics of the frame's columns (pandas default: numeric columns).
train_df.describe()

In [None]:
# The ten most frequent values of the `location` column.
train_df["location"].value_counts().head(10)

In [None]:
# Bar chart of how many rows fall into each `target` value.
counts = train_df["target"].value_counts()
ax = sns.barplot(x=counts.index, y=counts)
ax.set_xlabel("Target")
ax.set_ylabel("Count")

In [None]:
# Character length of each raw text entry.
train_df["Text_length"] = train_df["text"].map(len)

In [None]:
# One 20-bin histogram of Text_length per `target` value, drawn side by side.
length_text = sns.FacetGrid(train_df, col="target")
length_text.map(plt.hist, "Text_length", bins=20, color="r")

In [None]:
# Drop the columns that are not used after this point. Reassigning instead of
# mutating in place, together with errors="ignore", lets this cell be re-run
# without raising KeyError once the columns are already gone.
train_df = train_df.drop(columns=["id", "location", "keyword"], errors="ignore")

In [None]:
# Re-check missing values per column after removing id/location/keyword.
train_df.isna().sum()

In [None]:
# Project helper imported from utils.plot_target.
# NOTE(review): presumably plots the "Text_length" column split by target —
# confirm against utils/plot_target.py.
plot_target(train_df, "Text_length")

In [None]:
# Number of whitespace-separated tokens in each raw text entry.
train_df["words_counts"] = train_df["text"].str.split().map(len)

In [None]:
# Preview the first rows now that `words_counts` has been added.
train_df.head()

In [None]:
# Project helper imported from utils.plot_target.
# NOTE(review): presumably plots the "words_counts" column split by target —
# confirm against utils/plot_target.py.
plot_target(train_df, "words_counts")

In [None]:
# Number of distinct whitespace-separated tokens per entry; str() coerces
# non-string values before splitting.
train_df["unique_word_count"] = train_df["text"].map(
    lambda raw: len(set(str(raw).split()))
)

In [None]:
# Preview the first rows now that `unique_word_count` has been added.
train_df.head()

In [None]:
# Project helper imported from utils.plot_target.
# NOTE(review): presumably plots the 'unique_word_count' column split by
# target — confirm against utils/plot_target.py.
plot_target(train_df, 'unique_word_count')

In [None]:
def _count_punctuation(raw):
    """Return how many characters of ``str(raw)`` are in ``string.punctuation``."""
    return sum(1 for ch in str(raw) if ch in string.punctuation)


# Number of ASCII punctuation characters in each raw text entry.
train_df["punctuation_count"] = train_df["text"].map(_count_punctuation)

In [None]:
# Preview the first rows now that `punctuation_count` has been added.
train_df.head()

In [None]:
# Project helper imported from utils.plot_target.
# NOTE(review): presumably plots the 'punctuation_count' column split by
# target — confirm against utils/plot_target.py.
plot_target(train_df, 'punctuation_count')

In [None]:
# Apply the project's clean_text helper (utils.clean_text) to every raw text
# entry and store the result in a new column.
train_df["Text_cleaning"] = train_df["text"].apply(clean_text)

In [None]:
# Cleaned text split by label: rows with target == 1 and rows with target == 0.
x_1 = train_df.loc[train_df["target"] == 1, "Text_cleaning"]
x_0 = train_df.loc[train_df["target"] == 0, "Text_cleaning"]

In [None]:
# Features (cleaned text) and labels used for the train/test split below.
X, y = train_df["Text_cleaning"], train_df["target"]

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split identical on every Restart & Run All, so the pickled train/test files
# written at the end of the notebook are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Fit a bag-of-words vocabulary on the cleaned text and build the
# document-term count matrix.
# NOTE(review): this fits on the full column, including rows that ended up in
# X_test, and `train_counts` is not used by any later cell visible here —
# confirm whether it should be fit on X_train only.
count_vectorizer = CountVectorizer()
train_counts = count_vectorizer.fit_transform(train_df['Text_cleaning'])

In [None]:
# Pass the cleaned text to the project helper counter_word (utils.counter_word).
# NOTE(review): presumably returns a word -> frequency mapping; its length is
# logged as a metric and the object is pickled below — confirm against
# utils/counter_word.py.
text = train_df['Text_cleaning']
counter = counter_word(text)

In [None]:
# Number of entries in `counter`.
len(counter)

In [None]:
# Record the size of `counter` as a tracked metric.
live.log_metric("len_counter", len(counter))

# Finalize the DVCLive run. This is the only metric logged in the notebook and
# `live` is never used as a context manager, so without an explicit end() the
# run is never closed and its summary / DVC experiment is not written.
live.end()

### OUTPUTS

In [None]:
# Persist the full and split feature/label series, one pickle per object.
# Each file is opened in a `with` block so the handle is flushed and closed as
# soon as its dump finishes, rather than being left open.
_split_outputs = {
    "X": X,
    "X_train": X_train,
    "X_test": X_test,
    "y": y,
    "y_train": y_train,
    "y_test": y_test,
}
for _stem, _obj in _split_outputs.items():
    with open(DATA_DIR + _stem + ".pckl", "wb") as _fh:
        pickle.dump(_obj, _fh)

In [None]:
# Persist `counter`; the `with` block closes the file handle once the dump is
# complete instead of leaving it open.
with open(DATA_DIR + "counter.pckl", "wb") as _counter_fh:
    pickle.dump(counter, _counter_fh)

### Added by DataScientists

### Added by DataScientists