In [1]:
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
import nltk.data
from pandas_profiling import ProfileReport
import sys
sys.path.append("../src")
import Preprocessing
import fasttext

# Notebook configuration: where the blog corpus lives and how many rows to load.
BLOG_CSV_PATH = "../resource/data/blogtext.csv"
N_ROWS = 10000  # set to None to read the whole file

# English Punkt tokenizer; used later to split each text into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Read only the first N_ROWS rows so the notebook stays quick to re-run.
df = pd.read_csv(BLOG_CSV_PATH, nrows=N_ROWS)



### Filtering

In [2]:
# Keep only texts containing at least 10 ASCII letters.
has_enough_letters = df["text"].str.count(r"[a-zA-Z]").ge(10)
# Renumber the surviving rows 0..n-1; the later index-based merge with the
# feature table relies on this contiguous index.
df = df.loc[has_enough_letters].reset_index(drop=True)

### Feature Engineering

In [3]:
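# NOTE: the date-count feature is currently disabled -- `datefinder` is not
# imported in the import cell above. The draft is kept here for reference.
#def findDates(text):
#    try:
#        return len([date for date in\
#                    datefinder.find_dates(text)])
#    except:
#        return 0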

In [4]:
def _safe_ratio(count, total):
    """Return count / total, or 0.0 when total is zero."""
    return count / total if total else 0.0


def buildFeatures(text, sentence_tokenizer=None):
    """Compute hand-crafted surface features for a single text.

    Parameters
    ----------
    text : str
        The raw text to analyse.
    sentence_tokenizer : object, optional
        Any object with a ``tokenize(text)`` method returning a list of
        sentence strings. When omitted, the notebook-level ``tokenizer`` is
        used, so existing one-argument calls behave as before.

    Returns
    -------
    tuple
        Sixteen values, in this order:
        text length (characters), number of URL-like tokens, number of mail
        mentions, uppercase / lowercase / digit / symbol ratios (relative to
        the text length), mean and variance of characters per word, ratio of
        unique words, mean and variance of characters per sentence, mean and
        variance of words per sentence, the highest per-sentence uppercase
        ratio, and the character length of the first sentence reaching it.

    Notes
    -----
    Texts without characters, words or sentences yield 0 for the affected
    features instead of raising ``ZeroDivisionError`` / ``ValueError``.
    The date-count feature (``findDates``) is currently disabled.
    """
    if sentence_tokenizer is None:
        # Resolved at call time so the notebook-level tokenizer stays the default.
        sentence_tokenizer = tokenizer

    len_text = len(text)
    words = text.split()
    # Drop empty strings so the per-sentence ratios never divide by zero.
    sentences = [sentence for sentence in sentence_tokenizer.tokenize(text) if sentence]

    # Number of whitespace-separated tokens that contain a URL marker.
    url_keywords = ("urlLink", "http", "www")
    nb_urls = sum(any(keyword in word for keyword in url_keywords) for word in words)

    # Number of e-mail addresses plus standalone occurrences of the word "mail".
    nb_mails = len(re.findall(
        r"([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+|\bmail\b)", text))

    # Share of uppercase letters, lowercase letters, digits and symbols.
    uppercase_ratio = _safe_ratio(len(re.findall(r'[A-Z]', text)), len_text)
    lowercase_ratio = _safe_ratio(len(re.findall(r'[a-z]', text)), len_text)
    number_ratio = _safe_ratio(len(re.findall(r'[0-9]', text)), len_text)
    symbol_ratio = _safe_ratio(len(re.findall(r'[$-/:-?{-~!"^_`\[\]]', text)), len_text)

    # Characters per word (computed once and reused for mean and variance).
    word_lengths = [len(word) for word in words]
    avg_letters_per_word = np.mean(word_lengths) if word_lengths else 0.0
    var_letters_per_word = np.var(word_lengths) if word_lengths else 0.0
    unique_words_ratio = _safe_ratio(len(set(words)), len(words))

    # Characters per sentence.
    sentence_lengths = [len(sentence) for sentence in sentences]
    avg_letters_per_sentence = np.mean(sentence_lengths) if sentence_lengths else 0.0
    var_letters_per_sentence = np.var(sentence_lengths) if sentence_lengths else 0.0

    # Words per sentence.
    words_per_sentence = [len(sentence.split()) for sentence in sentences]
    avg_words_per_sentence = np.mean(words_per_sentence) if words_per_sentence else 0.0
    var_words_per_sentence = np.var(words_per_sentence) if words_per_sentence else 0.0

    # The "shouting" sentence: highest uppercase share and that sentence's length.
    if sentences:
        uppercase_per_sentence = [len(re.findall(r'[A-Z]', sentence)) / len(sentence)
                                  for sentence in sentences]
        max_sentence_uppercase_ratio = max(uppercase_per_sentence)
        # list.index returns the first sentence that reaches the maximum.
        loudest = sentences[uppercase_per_sentence.index(max_sentence_uppercase_ratio)]
        max_sentence_uppercase_len = len(loudest)
    else:
        max_sentence_uppercase_ratio = 0.0
        max_sentence_uppercase_len = 0

    return (len_text, nb_urls, nb_mails,
            uppercase_ratio, lowercase_ratio, number_ratio, symbol_ratio,
            avg_letters_per_word, var_letters_per_word, unique_words_ratio,
            avg_letters_per_sentence, var_letters_per_sentence,
            avg_words_per_sentence, var_words_per_sentence,
            max_sentence_uppercase_ratio, max_sentence_uppercase_len)
           

In [5]:
# Feature names, listed in the same order as the values returned by buildFeatures.
columns = [
    "Text length", "Number URLs", "Number mails",
    "Uppercase ratio", "Lowercase ratio", "Number ratio", "Symbol ratio",
    "Average letters per word", "Variance of letters per word", "Unique words ratio",
    "Average letters per sentence", "Variance of letters per sentence",
    "Average words per sentence", "Variance of words per sentence",
    "Maximal uppercase ratio per sentence", "Length of the maximal uppercase ratio sentence",
]

# Compute one feature tuple per text; tqdm reports progress while iterating.
features = list(map(buildFeatures, tqdm(df["text"])))

# Attach the feature table to the original rows by matching index labels.
feature_table = pd.DataFrame(features, columns=columns)
df_preprocessed = df.merge(feature_table, left_index=True, right_index=True)

100%|██████████| 9861/9861 [00:10<00:00, 924.85it/s] 


### Text Preprocessing

In [6]:
# Run every raw text through the project's Preprocessing module.
preprocessor = Preprocessing.Preprocessing()
df_preprocessed["text_preprocessed"] = preprocessor.ProcessMany(df_preprocessed["text"])

# Predict the main language of each preprocessed text.
model = fasttext.load_model('../src/data/lid.176.ftz')
# First element of the first item returned by predict -- presumably the
# top-ranked label per the fastText API (TODO confirm). Evaluated lazily.
predicted_labels = (model.predict(text)[0][0]
                    for text in tqdm(df_preprocessed["text_preprocessed"]))
# Keep only the part after the last "__" of each label.
df_preprocessed["main_language"] = [label.split("__")[-1] for label in predicted_labels]

100%|██████████| 9861/9861 [03:53<00:00, 42.16it/s] 
100%|██████████| 9861/9861 [00:01<00:00, 8626.21it/s]


### Clustering

### Data Transformation

### Data-Splits

### Training 

### Model Evaluation 