In [1]:
import nltk
import string
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('datasets/brand_tweets.csv')
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [3]:
df.drop(df[df.tweet_text.isnull()].index, inplace=True)
df.shape

(3904, 3)

In [4]:
sample_str = df.tweet_text.values[0]
sample_str

'.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.'

In [5]:
print('== Исходный текст == \n%s\n\n' % sample_str)

tokenized_str = nltk.word_tokenize(sample_str)

print('== Токенизированный текст == \n%s\n\n' % tokenized_str)

== Исходный текст == 
.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.


== Токенизированный текст == 
['.', '@', 'wesley83', 'I', 'have', 'a', '3G', 'iPhone', '.', 'After', '3', 'hrs', 'tweeting', 'at', '#', 'RISE_Austin', ',', 'it', 'was', 'dead', '!', 'I', 'need', 'to', 'upgrade', '.', 'Plugin', 'stations', 'at', '#', 'SXSW', '.']




In [6]:
tokens = [i.lower() for i in tokenized_str if (i not in string.punctuation)]
print(tokens)

['wesley83', 'i', 'have', 'a', '3g', 'iphone', 'after', '3', 'hrs', 'tweeting', 'at', 'rise_austin', 'it', 'was', 'dead', 'i', 'need', 'to', 'upgrade', 'plugin', 'stations', 'at', 'sxsw']


In [7]:
stop_words = nltk.corpus.stopwords.words('english')
# stop_words

In [8]:
filtered_tokens = [i for i in tokens if (i not in stop_words)]
print(filtered_tokens)

['wesley83', '3g', 'iphone', '3', 'hrs', 'tweeting', 'rise_austin', 'dead', 'need', 'upgrade', 'plugin', 'stations', 'sxsw']


In [9]:
def tokenize_text(raw_text: str):
    """
    Функция для токенизации текста
    :param raw_text: исходная текстовая строка
    """
    tokenized_str = nltk.word_tokenize(raw_text)
    tokens = [i.lower() for i in tokenized_str if (i not in string.punctuation)]
    filtered_tokens = [i for i in tokens if (i not in stop_words)]
    return filtered_tokens

In [10]:
tokenized_tweets = df.tweet_text.apply(tokenize_text)
df = df.assign(tokenized=tokenized_tweets)

In [11]:
df.tokenized.head()

0    [wesley83, 3g, iphone, 3, hrs, tweeting, rise_...
1    [jessedee, know, fludapp, awesome, ipad/iphone...
2        [swonderlin, wait, ipad, 2, also, sale, sxsw]
3    [sxsw, hope, year, 's, festival, n't, crashy, ...
4    [sxtxstate, great, stuff, fri, sxsw, marissa, ...
Name: tokenized, dtype: object

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=tokenize_text)
document_matrix = vectorizer.fit_transform(df.tweet_text.values)

In [13]:
document_matrix

<3904x7258 sparse matrix of type '<class 'numpy.int64'>'
	with 46023 stored elements in Compressed Sparse Row format>

In [14]:
dir(CountVectorizer)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_validate_data',
 '_validate_ngram_range',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',
 '_white_spaces',
 '_word_ngrams',
 'build_analyzer',
 'build_preprocessor',
 'build_tokenizer',
 'decode',
 'fit',
 'fit_tra

In [15]:
source_tweet_index = 0

df.tweet_text.values[source_tweet_index]

'.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.'

In [16]:
from sklearn.metrics import pairwise_distances

tweet_distance = 1 - pairwise_distances(
    document_matrix, 
    metric='cosine')

In [17]:
tweet_distance.shape

(3904, 3904)

In [18]:
import numpy as np
sorted_similarity = np.argsort(-tweet_distance[source_tweet_index, :])
sorted_similarity

array([   0,  633,  420, ...,  579, 3771, 1330])

In [19]:
print(df.iloc[source_tweet_index].tweet_text)
print('-------------')
print(df.iloc[sorted_similarity[1]].tweet_text)
print('-------------')
print(df.iloc[sorted_similarity[2]].tweet_text)
print('-------------')
print(df.iloc[sorted_similarity[3]].tweet_text)
print('-------------')

.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.
-------------
.@mention I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.
-------------
IPhone is dead. Find me on the secret batphone #sxsw.
-------------
The big takeaway from #SXSW interactive - I need an iphone.
-------------


In [20]:
from gensim.models import Word2Vec
import logging

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', 
    level=logging.INFO)

In [21]:
texts = df.tokenized.values
texts[:3]

array([list(['wesley83', '3g', 'iphone', '3', 'hrs', 'tweeting', 'rise_austin', 'dead', 'need', 'upgrade', 'plugin', 'stations', 'sxsw']),
       list(['jessedee', 'know', 'fludapp', 'awesome', 'ipad/iphone', 'app', "'ll", 'likely', 'appreciate', 'design', 'also', "'re", 'giving', 'free', 'ts', 'sxsw']),
       list(['swonderlin', 'wait', 'ipad', '2', 'also', 'sale', 'sxsw'])],
      dtype=object)

In [26]:
# CBOW
model = Word2Vec(
    texts, 
    vector_size=10, 
    window=7, 
    min_count=2,
    workers=7, 
    epochs=10, 
    sg=0)

2023-06-16 06:35:18,125 : INFO : collecting all words and their counts
2023-06-16 06:35:18,126 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-06-16 06:35:18,137 : INFO : collected 7253 word types from a corpus of 48371 raw words and 3904 sentences
2023-06-16 06:35:18,138 : INFO : Creating a fresh vocabulary
2023-06-16 06:35:18,188 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 3083 unique words (42.51% of original 7253, drops 4170)', 'datetime': '2023-06-16T06:35:18.188931', 'gensim': '4.3.1', 'python': '3.8.12 | packaged by conda-forge | (default, Sep 29 2021, 19:21:23) \n[Clang 11.1.0 ]', 'platform': 'macOS-13.4-arm64-arm-64bit', 'event': 'prepare_vocab'}
2023-06-16 06:35:18,189 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 44201 word corpus (91.38% of original 48371, drops 4170)', 'datetime': '2023-06-16T06:35:18.189387', 'gensim': '4.3.1', 'python': '3.8.12 | packaged by conda-forge | (default, Sep

In [23]:
model

<gensim.models.word2vec.Word2Vec at 0x282d26490>

In [24]:
model.wv.get_vector('android')

array([ 2.059951  ,  0.08012879,  1.7577075 , -0.86151916,  1.249628  ,
       -0.03899164,  2.2073784 ,  1.974332  , -1.2460318 , -2.0890038 ],
      dtype=float32)

In [25]:
model.wv.most_similar('android')

[('blackberry', 0.9929759502410889),
 ('iphone', 0.9861955046653748),
 ('witnessed', 0.9823786616325378),
 ('cheers', 0.9805057048797607),
 ('q', 0.9786323308944702),
 ('amp', 0.9769442677497864),
 ('hootsuite', 0.9755609631538391),
 ('ranking', 0.9753310084342957),
 ('shared', 0.9746308326721191),
 ('pocket', 0.9744592905044556)]

In [27]:
# SKIP GRAM
model = Word2Vec(
    texts, 
    vector_size=10, 
    window=7, 
    min_count=2,
    workers=7, 
    epochs=10, 
    sg=1)

2023-06-16 06:36:22,195 : INFO : collecting all words and their counts
2023-06-16 06:36:22,196 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-06-16 06:36:22,208 : INFO : collected 7253 word types from a corpus of 48371 raw words and 3904 sentences
2023-06-16 06:36:22,208 : INFO : Creating a fresh vocabulary
2023-06-16 06:36:22,259 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 3083 unique words (42.51% of original 7253, drops 4170)', 'datetime': '2023-06-16T06:36:22.259209', 'gensim': '4.3.1', 'python': '3.8.12 | packaged by conda-forge | (default, Sep 29 2021, 19:21:23) \n[Clang 11.1.0 ]', 'platform': 'macOS-13.4-arm64-arm-64bit', 'event': 'prepare_vocab'}
2023-06-16 06:36:22,259 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 44201 word corpus (91.38% of original 48371, drops 4170)', 'datetime': '2023-06-16T06:36:22.259725', 'gensim': '4.3.1', 'python': '3.8.12 | packaged by conda-forge | (default, Sep

In [28]:
model.wv.most_similar('iphone')

[('app', 0.9378886818885803),
 ('show', 0.9139389395713806),
 ('grape', 0.9023708701133728),
 ('share', 0.894133985042572),
 ('see', 0.8904950618743896),
 ('guide', 0.8875192403793335),
 ('get', 0.8871597647666931),
 ('working', 0.8865310549736023),
 ('check', 0.8849830627441406),
 ('must', 0.8840959668159485)]