# Gathering and Preprocessing of Data

In this phase, data is collected from social media sites such as Twitter and Reddit.

1. Collecting data from Twitter

In [1]:
from tweepy import API 
from tweepy import Cursor
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
 
import twitter_credentials
import numpy as np
import pandas as pd

In [7]:
# # # # TWITTER CLIENT # # # #
class TwitterClient():
    """
    Wrapper around an authenticated tweepy ``API`` object.

    ``twitter_user`` is stored and passed as the ``id`` argument of every
    cursor query issued by the helper methods below.
    """
    def __init__(self, twitter_user=None):
        authenticator = TwitterAuthenticator()
        self.auth = authenticator.authenticate_twitter_app()
        self.twitter_client = API(self.auth)
        self.twitter_user = twitter_user

    def get_twitter_client_api(self):
        """Return the ``API`` object created in ``__init__``."""
        return self.twitter_client

    def get_user_timeline_tweets(self, num_tweets):
        """Return the items of the ``user_timeline`` cursor, limited by ``num_tweets``, as a list."""
        timeline_cursor = Cursor(self.twitter_client.user_timeline, id=self.twitter_user)
        return list(timeline_cursor.items(num_tweets))

    def get_friend_list(self, num_friends):
        """Return the items of the ``friends`` cursor, limited by ``num_friends``, as a list."""
        friends_cursor = Cursor(self.twitter_client.friends, id=self.twitter_user)
        return list(friends_cursor.items(num_friends))

    def get_home_timeline_tweets(self, num_tweets):
        """Return the items of the ``home_timeline`` cursor, limited by ``num_tweets``, as a list."""
        home_cursor = Cursor(self.twitter_client.home_timeline, id=self.twitter_user)
        return list(home_cursor.items(num_tweets))


# # # # TWITTER AUTHENTICATOR # # # #
class TwitterAuthenticator():
    """Creates the OAuth handler from the values held in ``twitter_credentials``."""

    def authenticate_twitter_app(self):
        """Return an ``OAuthHandler`` with both the consumer pair and the access token set."""
        handler = OAuthHandler(
            twitter_credentials.CONSUMER_KEY,
            twitter_credentials.CONSUMER_SECRET,
        )
        handler.set_access_token(
            twitter_credentials.ACCESS_TOKEN,
            twitter_credentials.ACCESS_TOKEN_SECRET,
        )
        return handler

# # # # TWITTER STREAMER # # # #
class TwitterStreamer():
    """
    Opens a keyword-filtered live stream and passes incoming data to a TwitterListener.
    """
    def __init__(self):
        # The attribute name (spelling included) is kept unchanged for existing callers.
        self.twitter_autenticator = TwitterAuthenticator()

    def stream_tweets(self, fetched_tweets_filename, hash_tag_list):
        """
        Start a stream filtered on ``hash_tag_list``.

        ``fetched_tweets_filename`` is handed to the listener that receives the data.
        """
        tweet_listener = TwitterListener(fetched_tweets_filename)
        oauth_handler = self.twitter_autenticator.authenticate_twitter_app()

        # Connect, then restrict the stream to the tracked keywords.
        live_stream = Stream(oauth_handler, tweet_listener)
        live_stream.filter(track=hash_tag_list)


# # # # TWITTER STREAM LISTENER # # # #
class TwitterListener(StreamListener):
    """
    Stream listener that prints each received payload and appends it to a file.
    """
    def __init__(self, fetched_tweets_filename):
        """
        :param fetched_tweets_filename: path of the file that payloads are appended to.
        """
        # Run the base initialiser so any state it sets up is present on this instance.
        super().__init__()
        self.fetched_tweets_filename = fetched_tweets_filename

    def on_data(self, data):
        """
        Print ``data`` and append it to ``fetched_tweets_filename``.

        Always returns True; failures are reported on stdout instead of being raised.
        """
        try:
            print(data)
            # Explicit UTF-8: tweet text is often non-ASCII and must not depend
            # on the platform's default encoding.
            with open(self.fetched_tweets_filename, 'a', encoding='utf-8') as tf:
                tf.write(data)
        except Exception as e:
            # Exception (not BaseException) so KeyboardInterrupt / SystemExit
            # still propagate and the stream can be stopped.
            print("Error on_data %s" % str(e))
        return True

    def on_error(self, status):
        """
        Return False for status 420; print any other status.
        """
        if status == 420:
            # Status 420 is treated as the rate-limit signal: return False from on_error.
            return False
        print(status)


class TweetAnalyzer():
    """
    Helpers for turning tweet objects into tabular data for analysis.
    """
    def tweets_to_data_frame(self, tweets):
        """
        Build a DataFrame with a single 'Tweets' column holding each tweet's ``text``.
        """
        texts = []
        for tweet in tweets:
            texts.append(tweet.text)

        # Other per-tweet attributes (id, text length, created_at, source,
        # favorite_count, retweet_count) were sketched as extra columns but
        # are not populated here.
        frame = pd.DataFrame(data=texts, columns=['Tweets'])
        return frame

 
# Script entry: request tweets for one screen name, tabulate their text,
# show the table and save it to disk.
if __name__ == '__main__':

    twitter_client = TwitterClient()
    tweet_analyzer = TweetAnalyzer()

    # Use the API object directly rather than the cursor helpers on TwitterClient.
    api = twitter_client.get_twitter_client_api()

    # Request 50 tweets from the timeline of the "AUSvIND" screen name.
    tweets = api.user_timeline(screen_name="AUSvIND", count=50)

    # Debug helpers for inspecting a single tweet object:
    #print(dir(tweets[0]))
    #print(tweets[0].retweet_count)

    # One-column ('Tweets') DataFrame of the tweet texts.
    df = tweet_analyzer.tweets_to_data_frame(tweets)
    
    # Show up to 50 rows, matching the count requested above.
    print(df.head(50))
    #df.to_pickle('tweets.pkl')
    # Save as tab-separated UTF-8 text without the index column.
    df.to_csv('twitterdataset.csv', sep='\t', encoding='utf-8', index=False)

                                               Tweets
0   India 210 (A Kumble 45*, 70.5 ov) - Match over...
1    Wicket!! India 210 (A Kumble 45*, 70.5 ov)  #tms
2   Wicket!! India 210/9 (A Kumble 45*, 70.2 ov)  ...
3   Wicket!! India 210/8 (A Kumble 45*, 70.1 ov)  ...
4   India 200/7 (A Kumble 35*, Harbhajan Singh 7*,...
5   Wicket!! India 185/7 (A Kumble 27*, 61.2 ov)  ...
6   India 181/6 (MS Dhoni 35*, A Kumble 23*, 57.4 ...
7   India 177/6 (MS Dhoni 31*, A Kumble 23*, 54.5 ...
8   India 172/6 (MS Dhoni 31*, A Kumble 18*, 51.2 ...
9   India 161/6 (MS Dhoni 24*, A Kumble 14*, 47.5 ...
10  India 150/6 (MS Dhoni 19*, A Kumble 8*, 43.3 o...
11  Wicket!! India 137/6 (MS Dhoni 14*, 40.2 ov)  ...
12  India 131/5 (SC Ganguly 50*, MS Dhoni 9*, 38.0...
13  Wicket!! India 115/5 (SC Ganguly 43*, 33.4 ov)...
14  Wicket!! India 115/4 (SC Ganguly 43*, 33.1 ov)...
15  India 115/3 (R Dravid 38*, SC Ganguly 43*, 33....
16  India 98/3 (R Dravid 31*, SC Ganguly 33*, 27.6...
17  India 79/3 (R Dravid 27*

2. Collecting data from Reddit

In [6]:
import os

import praw
import pandas as pd
from praw.models import MoreComments

# Credentials are read from the environment so that they are never stored in
# the notebook. A KeyError here means REDDIT_CLIENT_ID or REDDIT_CLIENT_SECRET
# is not set. Any credentials previously committed in this cell should be
# revoked and regenerated.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='Scraping Example',
)

# Alternative source, kept for reference: titles of 10 hot posts from the
# Cricket subreddit.
# hot_posts = reddit.subreddit('Cricket').hot(limit=10)
# for post in hot_posts:
#     print(post.title)

# Collect the body of every top-level comment of one submission.
submission = reddit.submission(id="f1zmjl")
# limit=0 so that iterating the comments yields only objects with a `.body`.
submission.comments.replace_more(limit=0)
comments = pd.DataFrame(
    [[top_level_comment.body] for top_level_comment in submission.comments],
    columns=['comment'],
)
print(comments)
#comments.to_pickle('redditdataset.pkl')
# Save as tab-separated UTF-8 text without the index column.
comments.to_csv('redditdataset.csv', sep='\t', encoding='utf-8', index=False)

                                               comment
0    So this is the worst series of Kohli like in y...
1                            Saini ODI average at 53 😂
2    SANTNER OVERRATED\n\nNEESHAM OUTDATED\n\nLONG ...
3    Kohli running with Shaw is going to be interes...
4                                            [deleted]
..                                                 ...
445             Guptill's playing on a different pitch
446  Guppy got all of his misses out of the way dur...
447     guptill is getting the orange cap at the t20wc
448  Remember when Munro was our opener? Mad times ...
449                Ok. Who brought the nanobots to NZ?

[450 rows x 1 columns]


# Applying TF-IDF to the collected dataset

In [10]:
import csv

# Flatten the saved Reddit comments file into one list of space-separated tokens.
# QUOTE_NONE leaves quote characters attached to the tokens, and the header
# cell 'comment' ends up as the first token.
input_data = []
# The file was written as UTF-8, so read it as UTF-8 instead of relying on the
# platform default; newline='' lets the csv module handle line endings itself.
with open(r'redditdataset.csv', encoding='utf-8', newline='') as f:
    for row in csv.reader(f, delimiter=" ", quoting=csv.QUOTE_NONE):
        input_data.extend(row)
print(input_data)

['comment', 'So', 'this', 'is', 'the', 'worst', 'series', 'of', 'Kohli', 'like', 'in', 'years.', 'Saini', 'ODI', 'average', 'at', '53', '😂', '"SANTNER', 'OVERRATED', 'NEESHAM', 'OUTDATED', 'LONG', 'HAVE', 'WE', 'AWAITED', 'LE', 'BIG', 'MAN', 'HAS', 'BEEN', 'ACTIVATED"', '"Kohli', 'running', 'with', 'Shaw', 'is', 'going', 'to', 'be', 'interesting.', 'Edit:', 'Oh', 'my', 'god!"', '[deleted]', '"New', 'bowler', ':', '*exists*', 'India', ':', 'I', 'am', 'about', 'to', 'make', 'this', "man's", 'whole', 'career."', 'Fuck', "i'm", 'excited', 'for', 'the', 'future', 'of', 'long', 'levers', 'Jamieson.', 'A', 'handful', 'to', 'face', 'and', 'his', 'cameo', 'innings', 'was', 'no', 'fluke', 'he', 'can', 'swing', 'the', 'blade.', 'Hit', 'a', 'century', 'belting', 'the', 'piss', 'out', 'of', 'a', 'near', 'full', 'strength', 'English', 'bowling', 'unit', 'in', 'a', 'warm', 'up.', '[Source](https://www.stuff.co.nz/sport/cricket/102273999/kyle-jamieson-gets-under-englands-skin-on-his-way-to-smashing-su