In [108]:
import pandas as pd
import re
import numpy as np
from dicts import emoticon_dict, short_word_dict, apostrophe_dict # словари из материалов к ДЗ
import nltk

In [2]:
pd.set_option('max_colwidth', 600)

In [3]:
train_df = pd.read_csv('train_tweets.csv', index_col='id')
test_df = pd.read_csv('test_tweets.csv', index_col='id')

In [4]:
df = pd.concat([train_df, test_df])

In [5]:
train_df.size + test_df.size == df.size

False

In [6]:
df.tail(3)

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
49157,,"#hillary #campaigned today in #ohio((omg)) &amp; used words like ""assets&amp;liability"" never once did #clinton say thee(word) #radicalization"
49158,,"happy, at work conference: right mindset leads to culture-of-development organizations #work #mindset"
49159,,"my song ""so glad"" free download! #shoegaze #newmusic #newsong"


### 1. Удалим @user из всех твитов с помощью паттерна "@[\w]*". Для этого создадим функцию:
для того, чтобы найти все вхождения паттерна в тексте, необходимо использовать re.findall(pattern, input_txt) <br>
для для замены @user на пробел, необходимо использовать re.sub() при применении функции необходимо использовать np.vectorize(function).


In [7]:
text = "@user #sikh #temple @user1 vandalised in @username in #calgary, #wso condemns act"

In [8]:
re.findall("@[\w]*", text)

['@user', '@user1', '@username']

In [9]:
# re.sub Заменяет ВСЕ вхождения по строке
re.sub("@[\w]*", 'SUB_TEXT', text)

'SUB_TEXT #sikh #temple SUB_TEXT vandalised in SUB_TEXT in #calgary, #wso condemns act'

In [10]:
def remove_username(sentence):
    sentence = re.sub("@[\w]*", ' ', sentence)
    return sentence

In [11]:
remove_user_vectorize = np.vectorize(remove_username)

In [12]:
df['tweet_preprocessed'] = df['tweet'].apply(lambda x: remove_user_vectorize(x))

In [13]:
df.head(2)

Unnamed: 0_level_0,label,tweet,tweet_preprocessed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
2,0.0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked


### 2. Изменим регистр твитов на нижний с помощью .lower().


In [14]:
text = 'Text TEXT teXT'
text.lower()

'text text text'

In [15]:
df['tweet_preprocessed'] = df['tweet_preprocessed'].apply(lambda x: x.lower())

### 3. Заменим сокращения с апострофами (пример: ain't, can't) на пробел, используя apostrophe_dict. Для этого необходимо сделать функцию: для каждого слова в тексте проверить (for word in text.split()), если слово есть в словаре apostrophe_dict в качестве ключа (сокращенного слова), то заменить ключ на значение (полную версию слова). 

In [16]:
text = "some text can't bla bla he'd, text i'll again i'm."

In [17]:
text.split()

['some',
 'text',
 "can't",
 'bla',
 'bla',
 "he'd,",
 'text',
 "i'll",
 'again',
 "i'm."]

In [18]:
text

"some text can't bla bla he'd, text i'll again i'm."

за некоторомы словами следуют знаки препинания. Учтем  это, сохранив пунктуацию, потому что если поудалять её с самого начала, мы можем вырезать какие-нибудь смайлики или что-нибудь еще

In [19]:
for word in text.split():
    cleared_word = re.sub('[.!?&,]', '', word) # за некоторомы словами следуют знаки препинания
    if cleared_word in apostrophe_dict.keys():
        text = re.sub(cleared_word, apostrophe_dict[cleared_word], text)

In [20]:
text

'some text cannot bla bla he had / he would, text I shall / I will again I am.'

In [21]:
# def sub_by_dict(sentence, dictionary):
#     for word in sentence.split():
#         cleared_word = re.sub('[.!?&,]', '', word) # за некоторомы словами следуют знаки препинания
#         if cleared_word in dictionary.keys():
#             sentence = re.sub(cleared_word, dictionary[cleared_word], sentence)
#     return sentence

In [22]:
def sub_by_dict(sentence, dictionary):
    for word in sentence.split():
        cleared_word = re.sub('[.!?&,]', '', word) # за некоторомы словами следуют знаки препинания
        if cleared_word in dictionary.keys():
            sentence = sentence.replace(cleared_word, dictionary[cleared_word])
    return sentence

In [23]:
sub_by_dict_vectorize = np.vectorize(sub_by_dict)

In [24]:
df['tweet_preprocessed'] = df['tweet_preprocessed'].apply(lambda x: sub_by_dict_vectorize(x, apostrophe_dict))

In [25]:
df[599:600]

Unnamed: 0_level_0,label,tweet,tweet_preprocessed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
600,0.0,met this girl 5 yrs ago online b-ng gay friend. y does she look like this when she didn't b4? @user,met this girl 5 yrs ago online b-ng gay friend. y does she look like this when she did not b4?


### 4. Заменим сокращения на их полные формы, используя short_word_dict. Для этого воспользуемся функцией, используемой в предыдущем пункте.


In [26]:
text = 'some text g2g, wb text omg.'

In [27]:
sub_by_dict_vectorize(text, short_word_dict)

array('some text got to go, welcome back text oh my god.', dtype='<U49')

In [28]:
df['tweet_preprocessed'] = df['tweet_preprocessed'].apply(lambda x: sub_by_dict_vectorize(x, short_word_dict))

In [29]:
df[599:600]

Unnamed: 0_level_0,label,tweet,tweet_preprocessed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
600,0.0,met this girl 5 yrs ago online b-ng gay friend. y does she look like this when she didn't b4? @user,met this girl 5 yrs ago online b-ng gay friend. y does she look like this when she did not before?


### 5. Заменим эмотиконы (пример: ":)" = "happy") на пробелы, используя emoticon_dict. Для этого воспользуемся функцией, используемой в предыдущем пункте.

In [35]:
text = ' :‑). some :‑) text :) :('

In [36]:
sub_by_dict_vectorize(text, emoticon_dict)

array(' happy. some happy text happy sad', dtype='<U33')

In [37]:
df['tweet_preprocessed'] = df['tweet_preprocessed'].apply(lambda x: sub_by_dict_vectorize(x, emoticon_dict))

In [55]:
df[63:64]

Unnamed: 0_level_0,label,tweet,tweet_preprocessed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
64,0.0,you've really hu my feelings :(,you have really hu my feelings sad


### 6. Заменим пунктуацию на пробелы, используя re.sub() и паттерн r'[^\w\s]'.

In [56]:
def remove_punctuation(sentence):
    sentence = re.sub(r'[^\w\s]', ' ', sentence)
    return sentence

In [57]:
remove_punctuation_vectorize = np.vectorize(remove_punctuation)

In [58]:
text = 'some, text. is! here'

In [59]:
remove_punctuation_vectorize(text)

array('some  text  is  here', dtype='<U20')

In [60]:
df['tweet_preprocessed'] = df['tweet_preprocessed'].apply(lambda x: remove_punctuation_vectorize(x))

In [62]:
df.head(2)

Unnamed: 0_level_0,label,tweet,tweet_preprocessed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,when a father is dysfunctional and is so selfish he drags his kids into his dysfunction run
2,0.0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,thanks for lyft credit i cannot use cause they do not offer wheelchair vans in pdx disapointed getthanked


### 7. Заменим спец. символы на пробелы, используя re.sub() и паттерн r'[^a-zA-Z0-9]'

In [63]:
def remove_symbols(sentence):
    sentence = re.sub(r'[^a-zA-Z0-9]', ' ', sentence)
    return sentence

In [64]:
remove_symbols_vectorize = np.vectorize(remove_symbols)

In [66]:
text = '#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦'



In [67]:
remove_symbols_vectorize(text)

array(' model i love u take with u all the time in ur                                    ',
      dtype='<U82')

In [69]:
df['tweet_preprocessed'] = df['tweet_preprocessed'].apply(lambda x: remove_symbols_vectorize(x))

In [75]:
df[3:4]

Unnamed: 0_level_0,label,tweet,tweet_preprocessed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,0.0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,model i love yoyou take with yoyou all the time in yoyour


### 8. Заменим числа на пробелы, используя re.sub() и паттерн r'[^a-zA-Z]'.

In [77]:
def remove_numbers(sentence):
    sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)
    return sentence

In [78]:
remove_numbers_vectorize = np.vectorize(remove_numbers)

In [79]:
text = 'some1 text 4 !4'

In [80]:
remove_numbers_vectorize(text)

array('some  text     ', dtype='<U15')

In [81]:
df['tweet_preprocessed'] = df['tweet_preprocessed'].apply(lambda x: remove_numbers_vectorize(x))

In [85]:
df.loc[df['tweet'].str.contains('1')].head(1)

Unnamed: 0_level_0,label,tweet,tweet_preprocessed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
21,0.0,#euro2016 people blaming ha for conceded goal was it fat rooney who gave away free kick knowing bale can hit them from there.,euro people blaming ha for conceded goal was it fat rooney who gave away free kick knowing bale can hit them from there


### 9. Удалим из текста слова длиной в 1 символ, используя ' '.join([w for w in x.split() if len(w)>1])

In [95]:
def remove_short(sentence, min_len=1):
    cleared_text = ' '.join([word for word in sentence.split() if len(word) > min_len])
    return cleared_text

In [98]:
remove_short_vectorize = np.vectorize(remove_short)

In [99]:
text = 'some text i with m short words s'

In [100]:
remove_short_vectorize(text)

array('some text with short words', dtype='<U26')

In [101]:
df['tweet_preprocessed'] = df['tweet_preprocessed'].apply(lambda x: remove_short_vectorize(x))

In [104]:
df.loc[df['tweet'].str.contains('i ')].head(1)

Unnamed: 0_level_0,label,tweet,tweet_preprocessed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,0.0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,thanks for lyft credit cannot use cause they do not offer wheelchair vans in pdx disapointed getthanked


### 10. Поделим твиты на токены с помощью nltk.tokenize.word_tokenize, создав новый столбец 'tweet_token'.

In [111]:
df['tweet_token'] = df['tweet_preprocessed'].apply(lambda x: nltk.tokenize.word_tokenize(x))

### 11. Удалим стоп-слова из токенов, используя nltk.corpus.stopwords. Создадим столбец 'tweet_token_filtered' без стоп-слов.

In [119]:
stopwords = nltk.corpus.stopwords.words('english')

In [121]:
df['tweet_token_filtered'] = df['tweet_token'].apply(lambda x: [word for word in x if word not in stopwords])

### 12. Применим стемминг к токенам с помощью nltk.stem.PorterStemmer. Создадим столбец 'tweet_stemmed' после применения стемминга.


In [125]:
stemmer = nltk.stem.PorterStemmer()

In [130]:
df['tweet_stemmed'] = df['tweet_token_filtered'].apply(lambda x: [stemmer.stem(word) for word in x])

### 13. Применим лемматизацию к токенам с помощью nltk.stem.wordnet.WordNetLemmatizer. Создадим столбец 'tweet_lemmatized' после применения лемматизации.

In [131]:
lemmatizer = nltk.stem.WordNetLemmatizer()

In [134]:
df['tweet_lemmatized'] = df['tweet_token_filtered'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [135]:
df.head(3)

Unnamed: 0_level_0,label,tweet,tweet_preprocessed,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,when father is dysfunctional and is so selfish he drags his kids into his dysfunction run,"[when, father, is, dysfunctional, and, is, so, selfish, he, drags, his, kids, into, his, dysfunction, run]","[father, dysfunctional, selfish, drags, kids, dysfunction, run]","[father, dysfunct, selfish, drag, kid, dysfunct, run]","[father, dysfunctional, selfish, drag, kid, dysfunction, run]"
2,0.0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,thanks for lyft credit cannot use cause they do not offer wheelchair vans in pdx disapointed getthanked,"[thanks, for, lyft, credit, can, not, use, cause, they, do, not, offer, wheelchair, vans, in, pdx, disapointed, getthanked]","[thanks, lyft, credit, use, cause, offer, wheelchair, vans, pdx, disapointed, getthanked]","[thank, lyft, credit, use, caus, offer, wheelchair, van, pdx, disapoint, getthank]","[thanks, lyft, credit, use, cause, offer, wheelchair, van, pdx, disapointed, getthanked]"
3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"


### 14. Сохраним результат предобработки в pickle-файл.


In [136]:
df.to_pickle('tweets_.pickle')