In [2]:
import os
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
from dictionaries import apostrophe_dict, emoticon_dict, short_word_dict

In [4]:
# Load the training tweets from CSV and preview the first two rows.
train_df = pd.read_csv('train_tweets.csv')
train_df.head(2)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...


In [5]:
# Load the test tweets from CSV and preview the first two rows
# (the preview shows no `label` column for this file).
test_df = pd.read_csv('test_tweets.csv')
test_df.head(2)

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...


In [6]:
# Stack train and test into a single frame so that every preprocessing step
# is applied to both; rows coming from the test set get NaN in `label`.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
combine_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)
combine_df.head()

Unnamed: 0,id,label,tweet
0,1,0.0,@user when a father is dysfunctional and is s...
1,2,0.0,@user @user thanks for #lyft credit i can't us...
2,3,0.0,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation


In [7]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
combine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49159 entries, 0 to 49158
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      49159 non-null  int64  
 1   label   31962 non-null  float64
 2   tweet   49159 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 1.1+ MB
None


1. Удалим @user из всех твитов с помощью паттерна "@[\w]*". Для этого создадим функцию: 
 - для того, чтобы найти все вхождения паттерна в тексте, необходимо использовать re.findall(pattern, input_txt)
 - для замены @user на пробел необходимо использовать re.sub()
при применении функции необходимо использовать np.vectorize(function)


In [8]:
def pattern_process(input_text: str, pattern: str, repl=" ") -> str:
    """Replace every match of ``pattern`` in ``input_text`` with ``repl``.

    Args:
        input_text: Text to process.
        pattern: Regular expression describing the fragments to replace.
        repl: Replacement inserted for each match; defaults to a single space.

    Returns:
        The text after substitution, or the unchanged ``input_text`` when the
        substitution result is an empty string.
    """
    # re.sub already locates every match, so a separate re.findall pass whose
    # result is never used is not needed.
    new_text = re.sub(pattern, repl, input_text)
    # Fall back to the original text instead of returning an empty string.
    return new_text if new_text else input_text

In [9]:
# np.vectorize lets pattern_process be applied element-wise to the whole column.
pattern_func = np.vectorize(pattern_process)
# Raw string: '\w' inside a regular string literal is an invalid escape
# sequence (a warning today, an error in future Python versions).
combine_df['tweet_new'] = pattern_func(combine_df['tweet'], pattern=r'@[\w]*')
combine_df.head(2)

Unnamed: 0,id,label,tweet,tweet_new
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so se...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause ...


2. Изменим регистр твитов на нижний с помощью .lower()

In [10]:
# Normalise case: lower-case every cleaned tweet.
combine_df['tweet_new'] = combine_df['tweet_new'].map(str.lower)

3. Заменим сокращения с апострофами (пример: ain't, can't) на их полные формы, используя apostrophe_dict. Для этого необходимо сделать функцию: для каждого слова в тексте (for word in text.split()) проверить, есть ли оно в словаре apostrophe_dict в качестве ключа (сокращённого слова), и если есть, заменить ключ на значение (полную версию слова).

In [11]:
def word_from_dict(input_text: str, text_dict: dict) -> str:
    """Swap whitespace-separated words for their ``text_dict`` values.

    The text is split on whitespace; each word that is a key of ``text_dict``
    is replaced by the mapped value, every other word is kept unchanged, and
    the words are joined back with single spaces.
    """
    def _translate(token):
        # Words missing from the dictionary pass through untouched.
        try:
            return text_dict[token]
        except KeyError:
            return token

    return ' '.join(map(_translate, input_text.split()))

In [12]:
# Wrap word_from_dict with np.vectorize so it can be applied element-wise to the column.
dict_func = np.vectorize(word_from_dict)
# Expand contractions that are keys of apostrophe_dict
# (row 1 previewed below: "can't" -> "cannot").
combine_df['tweet_new'] = dict_func(combine_df['tweet_new'], apostrophe_dict)
combine_df.loc[1:1]

Unnamed: 0,id,label,tweet,tweet_new
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i cannot use cause the...


4. Заменим сокращения на их полные формы, используя short_word_dict. Для этого воспользуемся функцией, используемой в предыдущем пункте.

In [13]:
# Reuse dict_func to replace short forms that are keys of short_word_dict
# (row 3 previewed below: "u" -> "you").
combine_df['tweet_new'] = dict_func(combine_df['tweet_new'], short_word_dict)
combine_df.loc[3:3]

Unnamed: 0,id,label,tweet,tweet_new
3,4,0.0,#model i love u take with u all the time in ...,#model i love you take with you all the time i...


5. Заменим эмотиконы на их словесные описания (пример: ":)" = "happy"), используя emoticon_dict. Для этого воспользуемся функцией, используемой в предыдущем пункте.

In [14]:
# Replace whitespace-separated tokens that are keys of emoticon_dict with the dictionary's values.
combine_df['tweet_new'] = dict_func(combine_df['tweet_new'], emoticon_dict)

6. Заменим пунктуацию на пробелы, используя re.sub() и паттерн r'[^\w\s]'

In [15]:
# Replace every character that is neither a word character nor whitespace
# with pattern_process's default replacement (a single space).
combine_df['tweet_new'] = pattern_func(
    combine_df['tweet_new'], pattern=r'[^\w\s]')
combine_df.loc[8:8]

Unnamed: 0,id,label,tweet,tweet_new
8,9,0.0,we won!!! love the land!!! #allin #cavs #champ...,we won love the land allin cavs champ...


7. Заменим спец. символы на пробелы, используя re.sub() и паттерн r'[^a-zA-Z0-9]'

In [16]:
# Replace every character outside a-z, A-Z, 0-9 with a space; whitespace
# characters also match this pattern and are replaced by a space too.
combine_df['tweet_new'] = pattern_func(
    combine_df['tweet_new'], pattern=r'[^a-zA-Z0-9]')
combine_df.loc[10:10]

Unnamed: 0,id,label,tweet,tweet_new
10,11,0.0,â #ireland consumer price index (mom) climb...,ireland consumer price index mom climbe...


8. Заменим числа на пробелы, используя re.sub() и паттерн r'[^a-zA-Z]'

In [17]:
# Replace everything except ASCII letters with a space, which removes the
# digits that the earlier passes left in place.
combine_df['tweet_new'] = pattern_func(
    combine_df['tweet_new'], pattern=r'[^a-zA-Z]')
combine_df.loc[5:5]

Unnamed: 0,id,label,tweet,tweet_new
5,6,0.0,[2/2] huge fan fare and big talking before the...,huge fan fare and big talking before the...


9. Удалим из текста слова длиной в 1 символ, используя ' '.join([w for w in x.split() if len(w)>1])

In [18]:
# Drop one-character words; splitting and re-joining also collapses the runs
# of spaces left behind by the earlier substitutions.
combine_df['tweet_new'] = combine_df['tweet_new'].map(
    lambda text: ' '.join(token for token in text.split() if len(token) >= 2))
combine_df.loc[55:55]

Unnamed: 0,id,label,tweet,tweet_new
55,56,0.0,a scourge on those playing baroque pieces on p...,scourge on those playing baroque pieces on pia...


10. Поделим твиты на токены с помощью nltk.tokenize.word_tokenize, создав новый столбец 'tweet_token'.

In [19]:
# Split each cleaned tweet into a list of tokens with NLTK's word_tokenize.
combine_df['tweet_token'] = combine_df['tweet_new'].map(word_tokenize)
combine_df.head(2)

Unnamed: 0,id,label,tweet,tweet_new,tweet_token
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau..."


11. Удалим стоп-слова из токенов, используя nltk.corpus.stopwords. Создадим столбец 'tweet_token_filtered' без стоп-слов.

In [20]:
from nltk.corpus import stopwords

In [21]:
# Keep only the tokens that are absent from NLTK's English stop-word list;
# a set gives constant-time membership checks.
stop_words = set(stopwords.words("english"))
combine_df['tweet_token_filtered'] = combine_df['tweet_token'].map(
    lambda tokens: [token for token in tokens if token not in stop_words])
combine_df.head(2)

Unnamed: 0,id,label,tweet,tweet_new,tweet_token,tweet_token_filtered
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee..."


12. Применим стемминг к токенам с помощью nltk.stem.PorterStemmer. Создадим столбец 'tweet_stemmed' после применения стемминга.

In [22]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [23]:
# Reduce every remaining token to its Porter stem.
stemmer = PorterStemmer()

combine_df['tweet_stemmed'] = combine_df['tweet_token_filtered'].map(
    lambda tokens: list(map(stemmer.stem, tokens)))
combine_df.head(2)

Unnamed: 0,id,label,tweet,tweet_new,tweet_token,tweet_token_filtered,tweet_stemmed
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc..."


13. Применим лемматизацию к токенам с помощью nltk.stem.wordnet.WordNetLemmatizer. Создадим столбец 'tweet_lemmatized' после применения лемматизации.

In [24]:
# Map every remaining token to its WordNet lemma (lemmatizer defaults are used).
lemmatizer = WordNetLemmatizer()

combine_df['tweet_lemmatized'] = combine_df['tweet_token_filtered'].map(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens)))
combine_df.head(2)

Unnamed: 0,id,label,tweet,tweet_new,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thanks, lyft, credit, use, cause, offer, whee..."


14. Сохраним результат предобработки в pickle-файл.

In [25]:
# Persist the preprocessed frame, including all columns added above, as a pickle file.
combine_df.to_pickle('cleaned_tweets.pkl')