## Imports of functions that will help us clean the dataset

In [14]:
import sys

sys.path.append("..")

from src import cleaning
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize

Path to a subset of the dataset with 100,000 lines

In [16]:
path = "../data/raw/bigger_sample.csv"

# Keep only the 3rd and 4th columns of the ';'-separated file and read every
# value as a string; bytes that cannot be decoded are skipped instead of
# raising an error.
csv_options = dict(
    sep=";",
    usecols=[2, 3],
    names=["commit_message", "is_bot"],
    encoding_errors="ignore",
    dtype=str,
)
df = pd.read_csv(path, **csv_options)

In [17]:
# Preview the loaded frame (pandas shortens long frames to the first and last rows).
df

Unnamed: 0,commit_message,is_bot
0,Initial AUR Archive commit of 'ros-hydro-roste...,BOT
1,fix(package): update gatsby-source-filesystem ...,BOT
2,docs(readme): add Greenkeeper badge,BOT
3,chore(deps): update dependency webpack to v4.1...,BOT
4,Update boto3 from 1.9.64 to 1.9.127,BOT
...,...,...
99995,version 0.2.5 - settings\,NON-BOT
99996,Delete RD_jessie_4.jpg,NON-BOT
99997,save log in other database bug fix,NON-BOT
99998,Merge pull request #105 from neilvyas/patch-1\...,NON-BOT


Tested cleaning functions that remove line breaks, Chinese/Japanese characters, emojis, and other unusual characters such as ø

In [18]:
def _clean_commit_message(message):
    """Clean and tokenize a single raw commit message.

    The message is passed through the ``cleaning`` helpers (line breaks,
    emojis, hashes, non-alphanumeric characters) and then tokenized with
    ``word_tokenize``; the tokens are joined back with single spaces.

    Returns ``None`` when the message is missing or when nothing is left
    after cleaning, so the row can be removed with ``dropna`` afterwards.
    """
    # A missing message is NaN/None, not a string.  ``str(NaN)`` would turn it
    # into the literal text "nan", which is truthy and would be kept as if it
    # were a real commit message.
    if pd.isna(message):
        return None

    text = cleaning.replace_linebreaks(str(message))
    text = cleaning.remove_emojis(text)
    text = cleaning.replace_hash(text)
    text = cleaning.alphanum(text)
    if not text:
        return None
    return " ".join(word_tokenize(text))


out = [_clean_commit_message(x) for x in tqdm(df["commit_message"])]
df["commit_message"] = out

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:14<00:00, 7093.46it/s]


In [19]:
# Drop every row that has a missing value in any column, e.g. messages that
# were set to None because nothing was left after cleaning.  Reassigning the
# result (instead of ``inplace=True``) keeps the step an ordinary expression
# whose result can be renamed or chained later.
df = df.dropna()
df

Unnamed: 0,commit_message,is_bot
0,Initial AUR Archive commit of 'ros-hydro-roste...,BOT
1,fix ( package ) : update gatsby-source-filesys...,BOT
2,docs ( readme ) : add Greenkeeper badge,BOT
3,chore ( deps ) : update dependency webpack to ...,BOT
4,Update boto3 from 1.9.64 to 1.9.127,BOT
...,...,...
99995,version 0.2.5 - settings\,NON-BOT
99996,Delete RD_jessie_4.jpg,NON-BOT
99997,save log in other database bug fix,NON-BOT
99998,Merge pull request # 105 from neilvyas/patch-1...,NON-BOT


## Testing whether we can use accuracy as a metric -- the classes need to be balanced

In [24]:
# Ratio of BOT rows to all other rows; a value close to 1 means the two
# classes are roughly the same size.  The comparison is evaluated once and
# counted with the vectorized ``Series.sum`` rather than the builtin ``sum``,
# which walks the Series element by element.
is_bot_mask = df["is_bot"] == "BOT"
is_bot_mask.sum() / (~is_bot_mask).sum()

1.0293409694555113

The classes are balanced (the BOT to NON-BOT ratio is about 1.03), so we save the preprocessed data and copy the preprocessing script into the DataModule.

In [20]:
# Write the cleaned frame to the processed-data folder, leaving out the
# row index so that only the data columns are stored.
processed_path = "../data/processed/bigger_sample.csv"
df.to_csv(processed_path, index=False)