# Bag of related words
This program builds a bag of related words from the user's input: a CSV file of tweets sourced from the Twitter API.

In [None]:
#Step 1: Retrieve CSV file data

import pandas as pd

from google.colab import files
uploaded = files.upload()

import io

# `uploaded` maps each uploaded file name to its raw bytes. Prefer the expected
# export name, but fall back to the only uploaded file so that a differently
# named upload does not fail with a bare KeyError.
csv_name = 'data-science-complete-2019.csv'
if csv_name not in uploaded:
    if len(uploaded) != 1:
        # Ambiguous (or empty) upload: keep raising KeyError, but say why.
        raise KeyError(
            "Expected '{}' or exactly one uploaded file, got: {}".format(
                csv_name, sorted(uploaded)
            )
        )
    csv_name = next(iter(uploaded))

# NOTE(review): the name `csv` shadows the standard-library `csv` module; it is
# kept because the later cells read this variable.
csv = pd.read_csv(io.BytesIO(uploaded[csv_name]))
# Dataset is now stored in a dataframe

Saving data-science-complete-2019.csv to data-science-complete-2019 (2).csv


In [None]:
#Step 2: Twitter's CSV files carry columns that the bag of related words does not use.
#Discard them by position: columns 0-18 and 20-22 are dropped, column 19 is left untouched.

csv = csv.drop(columns=csv.columns[[*range(19), 20, 21, 22]])


Filtered dataframe

In [None]:
#Show the filtered dataframe (a bare expression on the last line is rendered as the cell output)
csv

Unnamed: 0,Tweet Text
0,#Robotica #Arduino #roboticaeducativa #progr...
1,#Robotica #Arduino #roboticaeducativa #progr...
2,CAMPUS DE SEMANA SANTA TECNOLoGICOS en los 1...
3,Conoce las claves para el desarrollo de proy...
4,Este sabado volvemos a la carga con #Oracle4...
...,...
514,⚠⚡ELECTROKIT de PROGRAMACIÓN ⚡⚠ 🚩a un increíbl...
515,💻”La programación es el medio para lograr nues...
516,📌Tienes pensado asistir al #TWCG19 desde #Ceut...
517,"🕵️‍♂️ ¡Búsqueda Destacada! ▶️ ""Data Scientist""..."


In [None]:
#Step 3: Perform text cleansing, show the cleaned text
import re


# Trim surrounding whitespace and lower-case every tweet first.
csv['Tweet Text'] = csv['Tweet Text'].str.strip()
csv['Tweet Text'] = csv['Tweet Text'].str.lower()

# Delete every character that is neither a word character nor whitespace
# (punctuation, '#', symbols). The pattern is a raw string so that '\w' and
# '\s' are not treated as (invalid) string escapes, and regex=True is passed
# explicitly because pandas >= 2.0 matches the pattern literally by default,
# which would leave the text unchanged.
csv['Tweet Text'] = csv['Tweet Text'].str.replace(r'[^\w\s]', '', regex=True)
# Removing symbols can expose new leading/trailing spaces, so strip again.
csv['Tweet Text'] = csv['Tweet Text'].str.strip()

print(csv)

                                            Tweet Text
0    robotica arduino roboticaeducativa programacio...
1    robotica arduino roboticaeducativa programacio...
2    campus de semana santa tecnologicos en los 17 ...
3    conoce las claves para el desarrollo de proyec...
4    este sabado volvemos a la carga con oracle4gir...
..                                                 ...
514  electrokit de programación  a un increíble pre...
515  la programación es el medio para lograr nuestr...
516  tienes pensado asistir al twcg19 desde ceuta e...
517  búsqueda destacada  data scientist envíanos tu...
518  flipad hoy ha surgido sobre la marcha este pro...

[519 rows x 1 columns]


  


In [None]:
#Step 4: Remove Spanish stopwords from the cleaned tweets
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# A set gives constant-time membership checks while filtering words.
stop = set(stopwords.words("spanish"))


def _remove_stopwords(text):
    """Return `text` with every whitespace-separated word found in `stop` removed."""
    kept_words = (token for token in text.split() if token not in stop)
    return ' '.join(kept_words)


# The filtered text goes into a new column; 'Tweet Text' itself is not modified here.
csv['tweet_without_stopwords'] = csv['Tweet Text'].apply(_remove_stopwords)
print(csv['tweet_without_stopwords'])

0      robotica arduino roboticaeducativa programacio...
1      robotica arduino roboticaeducativa programacio...
2      campus semana santa tecnologicos 17 centros co...
3      conoce claves desarrollo proyectos bigdata rec...
4      sabado volvemos carga oracle4girls barcelona q...
                             ...                        
514    electrokit programación increíble precio pierd...
515    programación medio lograr objetivos pequeños a...
516    pensado asistir twcg19 ceuta suerte puedes dis...
517    búsqueda destacada data scientist envíanos cv ...
518    flipad hoy surgido marcha proyecto final clase...
Name: tweet_without_stopwords, Length: 519, dtype: object


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#Show the dataframe, now holding both the cleaned 'Tweet Text' column and the
#'tweet_without_stopwords' column derived from it
csv

Unnamed: 0,Tweet Text,tweet_without_stopwords
0,robotica arduino roboticaeducativa programacio...,robotica arduino roboticaeducativa programacio...
1,robotica arduino roboticaeducativa programacio...,robotica arduino roboticaeducativa programacio...
2,campus de semana santa tecnologicos en los 17 ...,campus semana santa tecnologicos 17 centros co...
3,conoce las claves para el desarrollo de proyec...,conoce claves desarrollo proyectos bigdata rec...
4,este sabado volvemos a la carga con oracle4gir...,sabado volvemos carga oracle4girls barcelona q...
...,...,...
514,electrokit de programación a un increíble pre...,electrokit programación increíble precio pierd...
515,la programación es el medio para lograr nuestr...,programación medio lograr objetivos pequeños a...
516,tienes pensado asistir al twcg19 desde ceuta e...,pensado asistir twcg19 ceuta suerte puedes dis...
517,búsqueda destacada data scientist envíanos tu...,búsqueda destacada data scientist envíanos cv ...


In [None]:
#Step 5: Prepare for stemming of the "tweet_without_stopwords" column:
#build the Spanish Snowball stemmer and split every tweet into a list of words.
#(This cell only tokenises; the stemmer itself is applied in the following cell.)
import pandas as pd
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('spanish')

# pat=None is the default separator: split on runs of whitespace.
csv['tweet_without_stopwords'] = csv['tweet_without_stopwords'].str.split(pat=None)


In [None]:
#Stem every token of each tweet, then drop the original 'Tweet Text' column,
#leaving only the column that will be used to perform the Bag of Related Words
csv['tweet_without_stopwords'] = csv['tweet_without_stopwords'].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)
csv = csv.drop('Tweet Text', axis=1)

In [None]:
# Show the final dataframe: one row per tweet, each holding its list of stemmed tokens
csv

Unnamed: 0,tweet_without_stopwords
0,"[robot, arduin, roboticaeduc, programacion, cu..."
1,"[robot, arduin, roboticaeduc, programacion, cu..."
2,"[campus, seman, sant, tecnolog, 17, centr, cod..."
3,"[conoc, clav, desarroll, proyect, bigdat, reco..."
4,"[sab, volv, carg, oracle4girls, barcelon, qued..."
...,...
514,"[electrokit, program, increibl, preci, pierd, ..."
515,"[program, medi, logr, objet, pequeñ, aprend, p..."
516,"[pens, asist, twcg19, ceut, suert, pued, disfr..."
517,"[busqued, destac, dat, scientist, envian, cv, ..."


In [None]:
#Step 6: Collect the stemmed token lists into a plain Python list (one entry per tweet)
#to be used to perform association rules.
Tweets = [tokens for tokens in csv['tweet_without_stopwords']]
print(type(Tweets))

<class 'list'>


In [None]:
#Step 7: Association rule through the support measure, sup(A ⇒ B) = q(A ∪ B)/Q,
#where q(A ∪ B) is the number of transactions in which A and B occur together.
#Each tweet's token list is treated as one transaction.

from mlxtend.frequent_patterns.association_rules import association_rules
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Encode the token lists as a table with one column per distinct token.
te = TransactionEncoder()
te.fit(Tweets)
te_data = te.transform(Tweets)
df = pd.DataFrame(data=te_data, columns=te.columns_)

# Keep itemsets whose support is at least 0.01; use_colnames reports tokens
# instead of column indices.
# NOTE(review): the recorded run of this cell produced 271,251 itemsets at this
# threshold - confirm that this volume is intended.
df1 = apriori(df, use_colnames=True, min_support=0.01)
print(df1)
print(df1.sort_values(by="support", ascending=False))


         support                                           itemsets
0       0.021195                                                (1)
1       0.013487                                               (10)
2       0.013487                                               (11)
3       0.011561                                                (2)
4       0.013487                                               (20)
...          ...                                                ...
271246  0.011561  (tensorflow, iiot, linux, datascienc, pytorch,...
271247  0.011561  (tensorflow, iiot, linux, datascienc, pytorch,...
271248  0.011561  (tensorflow, iiot, linux, datascienc, pytorch,...
271249  0.011561  (tensorflow, iiot, linux, datascienc, pytorch,...
271250  0.011561  (tensorflow, iiot, linux, datascienc, pytorch,...

[271251 rows x 2 columns]
         support                                           itemsets
237     0.506744                                            (robot)
218     0.448940     

In [None]:
#Step 8: Document features builder: this code derives association rules from the frequent itemsets and provides measures of interest to further understand the content of the tweets.
#In particular, it will provide measures such as: Antecedent Support, Consequent Support, Lift, Leverage and Conviction.


#from mlxtend.preprocessing import TransactionEncoder
#from mlxtend.frequent_patterns import apriori
#from mlxtend.frequent_patterns import association_rules

#df_ar = association_rules(df1, metric = "confidence", min_threshold = 0.5)
#print(df_ar)