In [55]:
# Based on this tutorial:
# https://www.analyticsvidhya.com/blog/2020/04/how-to-deploy-machine-learning-model-flask/?utm_source=blog&utm_medium=10things_to_know_before_starting_data_science_project

# import required libraries, relies on pandas and sklearn
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


In [56]:
# read the dataset
data = pd.read_csv('sample_data/twitter_sentiments.csv')
# view the top rows
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation


In [57]:


train, test = train_test_split(data, test_size = 0.25, stratify = data['label'], random_state=21)

# get the shape of train and test split.
train.shape, test.shape

((23971, 3), (7991, 3))

So there are ~24k examples in the training dataset, with its id, label(1 if hate speech, 0 otherwise) and tweet content

In [58]:
train.head()

Unnamed: 0,id,label,tweet
27434,27435,0,lots of love and fun milleniawalk #milleniawalk #fun #family #instafun #selfie #mallingâ¦
25530,25531,0,@user ðgeek i know....its been a great day of filming. but one scene to do before i go home makes me do this ðð»ð
10107,10108,0,scott_brown-taking_drugs__out_of_my_brain-(plus16)-web-2004-ukhx_int . #web hardcore #1gabba #vk
20185,20186,0,father's day ð
9526,9527,0,@user #daddysays when he is agr is baar marks km aye to pocket money zero. smjhe :p


In [59]:
# create a TF-IDF vectorizer object, the stop_words parameter needs to be a list
# and ENGLISH_STOP_WORDS needs to be converted. It is just a list of common words to ignore.
tfidf_vectorizer = TfidfVectorizer(lowercase= True, max_features=1000, stop_words=list(ENGLISH_STOP_WORDS))



In [60]:
# fit the object with the training data tweets
tfidf_vectorizer.fit(train['tweet'])

In [61]:
# transform the train and test data, which just means these are the 
train_idf = tfidf_vectorizer.transform(train.tweet)
test_idf  = tfidf_vectorizer.transform(test.tweet)

In [62]:
train_idf.shape

(23971, 1000)

In [63]:
# create the object of LinearRegression Model
model_LR = LogisticRegression()

# fit the model with the training data
model_LR.fit(train_idf, train.label)

# predict the label on the traning data
predict_train = model_LR.predict(train_idf)

# predict the model on the test data
predict_test = model_LR.predict(test_idf)

# f1 score on train data
f1_score(y_true= train.label, y_pred= predict_train)
## >> 0.4888178913738019

f1_score(y_true= test.label, y_pred= predict_test)
## >> 0.45751633986928114

0.4289473684210526

In [64]:
# define the stages of the pipeline
pipeline = Pipeline(steps= [('tfidf', TfidfVectorizer(lowercase=True,
                                                      max_features=1000,
                                                      stop_words= list(ENGLISH_STOP_WORDS))),
                            ('model', LogisticRegression())])

# fit the pipeline model with the training data                            
pipeline.fit(train.tweet, train.label)

In [65]:
# sample tweet
text = ["Virat Kohli, AB de Villiers set to auction their 'Green Day' kits from 2016 IPL match to raise funds", "pokemon black and white"]

# predict the label using the pipeline
pipeline.predict(text)


array([0, 1])

In [66]:
# import joblib
from joblib import dump

# dump the pipeline model
dump(pipeline, filename="text_classification.joblib")

['text_classification.joblib']

In [67]:
# import joblib
from joblib import load

# sample tweet text
text = ["Virat Kohli, AB de Villiers set to auction their 'Green Day' kits from 2016 IPL match to raise funds", ""]

# load the saved pipleine model
pipeline = load("text_classification.joblib")

# predict on the sample tweet text
pipeline.predict(text)

array([0, 0])

In [68]:
!pip install praw


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [69]:
# import required libraries
import pandas as pd
pd.set_option('display.max_colwidth', 1000)
import praw

#set up reddit instance
reddit = praw.Reddit(
    client_id="fzvmD-u5jrC3Vh0jee27ig",
    client_secret="SA5UG3hFfeJ48LYIiW41-B450iJGgw",
    user_agent="python:hatespeech (by /u/326-away)",
    check_for_async=False
)

#test, print 10 submissions from hot



In [70]:
#test, print 10 submissions from hot
subreddit =  reddit.subreddit("test")
for submission in subreddit.hot(limit=10):
    print(submission.title)

test
This is a teat post
LOI First Division - Round 14 Discussion Thread / 10-05-2023
LOI First Division - Round 14 Discussion Thread / 10-05-2023
LOI First Division - Round 14 Discussion Thread / 10-05-2023
$3000 Gaming and UE5 computer. Help and discussion is greatly appreciated!
At Google I/O, generative AI gets to work
Google Cloud advances generative AI at I/O: new foundation models, embeddings, and tuning tools in Vertex AI
Introducing Duet AI for Google Cloud – an AI-powered collaborator
Announcing A3 supercomputers with NVIDIA H100 GPUs, purpose-built for AI


In [77]:
subreddit = reddit.subreddit("ubc")
comments = []
count = 0
for comment in subreddit.stream.comments():
      comments.append(comment.body)
      count += 1
      # If you've processed 20 comments, break out of the loop
      if count >= 20:
        break
pipeline.predict(comments)    

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [22]:
# list to store comments 
#comment_list = []
# Create subreddit object for desired subreddit, 
# TODO: Might wanna use the search for subreddit function
# subreddit = await reddit.subreddit('test')
# async for comment in subreddit.stream.comments(skip_existing=True):
#   print(comment)
#comment_list.append(comment)

CancelledError: ignored

In [97]:
def get_subreddit_comments(text_query):
  subreddit = reddit.subreddit(text_query)
  comments = []
  count = 0
  for comment in subreddit.stream.comments():
      comments.append(comment.body)
      count += 1
      # If you've processed 20 comments, break out of the loop
      if count >= 20:
        break
  return pipeline.predict(comments)    


In [98]:
get_subreddit_comments('test')

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [54]:
comment_list = []

subreddit =  reddit.subreddit('test')
for comment in subreddit.stream.comments(skip_existing=True):
  comment_list.append(comment.body)
pipeline.predict(comment_list)
    

KeyboardInterrupt: ignored