In [41]:
from dotenv import load_dotenv
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.feature import Word2Vec, Word2VecModel
import gensim
import nltk
import os
import sparknlp
import string
import tweepy
from pyspark.sql.types import *
from pyspark.sql import functions as F

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /common/home/ac1771/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
!pip3 install tweepy

Defaulting to user installation because normal site-packages is not writeable
Collecting tweepy
  Downloading tweepy-4.12.0-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 3.9 MB/s eta 0:00:01
Collecting requests-oauthlib<2,>=1.2.0
  Downloading requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)
Installing collected packages: requests-oauthlib, tweepy
Successfully installed requests-oauthlib-1.3.1 tweepy-4.12.0


In [5]:
load_dotenv()

True

In [6]:
bearer_token = os.environ.get('bearer_token')
consumer_key = os.environ.get('consumer_key')
consumer_secret = os.environ.get('consumer_secret')
access_token = os.environ.get('access_token')
access_token_secret = os.environ.get('access_token_secret')

In [7]:
client = tweepy.Client(
    bearer_token=bearer_token,
    consumer_key=consumer_key,
    consumer_secret=consumer_secret,
    access_token=access_token,
    access_token_secret=access_token_secret,
)

In [10]:
# Get a handles Twitter ID from https://tweeterid.com/
accounts = [
    {
        'handle': 'cnnbrk',
        'id': 428333
    },
    {
        'handle': 'CNN',
        'id': 759251
    },Word2VecModel
    {
        'handle': 'nytimes',
        'id': 807095
    },
    {
        'handle': 'BBCBreaking',
        'id': 5402612
    },
    {
        'handle': 'BBCWorld',
        'id': 742143
    },
    {
        'handle': 'TheEconomist',
        'id': 5988062
    },
    {
        'handle': 'WSJ',
        'id': 3108351
    },
    {
        'handle': 'washingtonpost',
        'id': 2467791
    },pd.DataFrame
    {
        'handle': 'TIME',
        'id': 14293310
    },
    {
        'handle': 'ABC',
        'id': 28785486
    },
    {
        'handle': 'ndtv',
        'id': 37034483
    },
    {
        'handle': 'AP',
        'id': 51241574kmeans_path
    },
    {
        'handle': 'XHNews',
        'id': 487118986
    },
    {
        'handle': 'HuffPost',
        'id': 14511951
    },
    {
        'handle': 'guardian',
        'id': 87818409
    },
    {
        'handle': 'BreakingNews',
        'id': 6017542
    },
    {
        'handle': 'SkyNews',
        'id': 7587032
    },
    {
        'handle': 'AJEnglish',
        'id': 4970411
    },
    {
        'handle': 'FT',
        'id': 18949452
    },
    {
        'handle': 'SkyNewsBreak',
        'id': 87416722
    },
    {
        'handle': 'politico',
        'id': 9300262
    },
    {
        'handle': 'CNBC',
        'id': 20402945
    },
    {
        'handle': 'FRANCE24',
        'id': 1994321
    },
    {
        'handle': 'guardiannews',
        'id': 788524
    },
    {
        'handle': 'Independent',
        'id': 16973333
    },
    {
        'handle': 'BBCAfrica',
        'id': 36670025
    },
    {
        'handle': 'Newsweek',
        'id': 2884771
    },
    {
        'handle': 'Telegraph',
        'id': 16343974
    },
    {
        'handle': 'RT_com',
        'id': 64643056
    },
    {
        'handle': 'CBCNews',
        'id': 6433472
    },
    {
        'handle': 'FinancialTimes',
        'id': 4898091
    },
    {
        'handle': 'Reuters',
        'id': 1652541
    },
    {
        'handle': 'SportsCenter',
        'id': 26257166
    },
    {
        'handle': 'espn',
        'id': 2557521
    }
]

In [11]:
def get_tweets(accounts, n):
    tweets = []
    for account in accounts:
        response = client.get_usepd.DataFramers_tweets(account['id'], max_results=n)
        for tweet in response.data:
            tweets.append(tweet.text)
    return tweets

In [12]:
tweets = get_tweets(accounts, 10)

In [13]:
tokenized_words = []
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

for tweet in tweets:
    new_tweet = tweet[:tweet.index('http')] if 'http' in tweet else tweet
    new_tweet = re.sub(r'[^a-zA-Z ]+', '', new_tweet)
    tokens = word_tokenize(new_tweet)
    remove_stopwords = [lemmatize_stemming(t.lower()) for t in tokens if not t in STOPWORDS and not t in string.punctuation and len(t) > 2]
    tokenized_words.append(",".join(remove_stopwords))

In [33]:
tokenized_words[0]

'paul,pelosi,husband,house,speaker,nancy,pelosi,release,san,francisco,hospital,recover,surgery,repair,skull,fracture,injuries,hand,arm,accord,source,familiar,matter'

In [38]:
schema = StringType()
tokenized_words_df = spark.createDataFrame(tokenized_words, schema=schema)

DataFrame[value: string]

In [42]:
tokenized_words_df = tokenized_words_df.withColumn("cleaned_text", F.split(F.col("value"), ","))

In [43]:
tokenized_words_df.show()

+--------------------+--------------------+
|               value|        cleaned_text|
+--------------------+--------------------+
|paul,pelosi,husba...|[paul, pelosi, hu...|
|the,fbi,issue,war...|[the, fbi, issue,...|
|bank,england,rais...|[bank, england, r...|
|pakistans,exprime...|[pakistans, expri...|
|judge,formally,se...|[judge, formally,...|
|the,feed,make,his...|[the, feed, make,...|
|trump,lawyers,saw...|[trump, lawyers, ...|
|russia,say,resume...|[russia, say, res...|
|cvs,walgreens,wal...|[cvs, walgreens, ...|
|man,accuse,attack...|[man, accuse, att...|
|who,favorite,cnn,...|[who, favorite, c...|
|russian,flag,sign...|[russian, flag, s...|
|more,cheese,count...|[more, cheese, co...|
|elections,officia...|[elections, offic...|
|rumor,adele,isnt,...|[rumor, adele, is...|
|grocery,price,soa...|[grocery, price, ...|
|elon,musk,say,pla...|[elon, musk, say,...|
|new,power,couple,...|[new, power, coup...|
|tell,shes,expect,...|[tell, shes, expe...|
|compostable,plast...|[compostab

In [44]:
word2vec_model = Word2VecModel.load('/common/users/shared/cs543_fall22_group3/models/word2vec')

In [45]:
result = word2vec_model.transform(tokenized_words_df.drop('value'))

In [46]:
result.show()

+--------------------+--------------------+
|        cleaned_text|      output_vectors|
+--------------------+--------------------+
|[paul, pelosi, hu...|[-0.0821658222652...|
|[the, fbi, issue,...|[-0.1995822859462...|
|[bank, england, r...|[-0.4200996284683...|
|[pakistans, expri...|[-0.0837285531684...|
|[judge, formally,...|[-0.0451845757042...|
|[the, feed, make,...|[-0.4107314121133...|
|[trump, lawyers, ...|[-0.2577153867931...|
|[russia, say, res...|[-0.4622900021763...|
|[cvs, walgreens, ...|[-0.4418119501322...|
|[man, accuse, att...|[-0.0433448063319...|
|[who, favorite, c...|[-0.0398816628148...|
|[russian, flag, s...|[-0.1780601602402...|
|[more, cheese, co...|[-0.0352580488958...|
|[elections, offic...|[-0.1379379596222...|
|[rumor, adele, is...|[0.1955163978899901]|
|[grocery, price, ...|[-0.2993410425260...|
|[elon, musk, say,...|[-0.0885379875877...|
|[new, power, coup...|[-0.0224394341930...|
|[tell, shes, expe...|[-0.0606921259313...|
|[compostable, pla...|[-0.182426

In [47]:
model = KMeansModel.load('/common/users/shared/cs543_fall22_group3/models/k_means')

In [49]:
model.setPredictionCol("newPrediction")

KMeansModel: uid=KMeans_8db78f979b9c, k=5, distanceMeasure=euclidean, numFeatures=1

In [52]:
result = result.withColumn('features', F.col("output_vectors"))

In [53]:
transformed = model.transform(result)

In [55]:
all_transformed = transformed.collect()

In [61]:
for i in all_transformed:
    if i[3] == 0:
        print(i[0])
        print(i[3])
        print()

['new', 'power', 'couple', 'take', 'stage', 'miss', 'argentina', 'miss', 'puerto', 'rico', 'shock', 'delight', 'fan', 'reveal', 'theyre', 'marry']
0

['edjsandoval', 'thank', 'anarodr', 'share', 'maites', 'amaze', 'story', 'rest', 'uvalde', 'families', 'allow', 'tamir']
0

['julie', 'powell', 'die', 'week', 'possess', 'understand', 'start', 'cook', 'universal', 'experience', 'voice', 'recipe', 'sound', 'like', 'adventure', 'juliamoskin', 'write']
0

['former', 'health', 'secretary', 'matt', 'hancock', 'suspend', 'conservative', 'join', 'cast', 'itvs', 'celebrity']
0

['you', 'dont', 'know', 'try', 'kill', 'meparkland', 'survivors', 'families', 'address', 'school', 'gunman', 'sentence']
0

['paintballs', 'shoot', 'dutch', 'wolves', 'bid', 'tame']
0

['eric', 'adams', 'mayor', 'new', 'york', 'tell', 'host', 'anne', 'mcelvoy', 'plan', 'build', 'city', 'work', 'locals', 'newcomers', 'alike']
0

['the', 'stories', 'slave', 'illegally', 'bring', 'america', 'clotildathe', 'american', 'slave',