In [1]:
# Building a Sentiment Classifier using Scikit-Learn

### Importing required libraries

import json
from time import strftime
import pandas as pd
from textblob import TextBlob


import numpy as np 
import re
import nltk 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

ModuleNotFoundError: No module named 'textblob'

In [None]:
def read_json(json_file: str) -> tuple:
    """
    json file reader to open and read a line-delimited json file into a list

    Every non-blank line of the file is parsed as one json document.

    Args:
    -----
    json_file: str - path of a json file (one json document per line)

    Returns
    -------
    tuple of (number of parsed documents, list of the parsed documents)

    Raises
    ------
    json.JSONDecodeError - if a non-blank line is not valid json
    OSError - if the file cannot be opened
    """

    tweets_data = []
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(json_file, 'r', encoding='utf-8') as json_lines:
        for line in json_lines:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at EOF) are not json documents;
            # passing them to json.loads would raise.
            if not line:
                continue
            tweets_data.append(json.loads(line))

    return len(tweets_data), tweets_data

In [None]:
# extract fields from the raw tweets into a dataframe
class TweetDfExtractor:
    """
    this class parses a list of tweet dictionaries into a pandas dataframe

    Every ``find_*`` / ``is_*`` method returns one value per tweet, in the
    same order as ``tweets_list``; ``get_tweet_df`` assembles those lists
    into the columns of a dataframe.

    Args:
    -----
    tweets_list: list - tweet dictionaries (parsed json documents)
    """
    def __init__(self, tweets_list):
        self.tweets_list = tweets_list

    # an example function
    def find_statuses_count(self) -> list:
        """Return ``tweet['user']['statuses_count']`` for every tweet."""
        # the status count of the USER who sent the tweet
        return [tweet['user']['statuses_count'] for tweet in self.tweets_list]

    def find_full_text(self) -> list:
        """Return ``tweet['full_text']`` for every tweet (KeyError if absent)."""
        return [tweet['full_text'] for tweet in self.tweets_list]

    def find_sentiments(self, text) -> tuple:
        """
        Compute TextBlob sentiment scores for each text.

        Args:
        -----
        text: iterable of str - the texts to score

        Returns
        -------
        tuple of (polarity list, subjectivity list), aligned with ``text``
        """
        polarity = []
        subjectivity = []
        for tweet in text:
            # Build the TextBlob once per text and read both scores from it,
            # instead of analysing every text twice.
            sentiment = TextBlob(tweet).sentiment
            polarity.append(sentiment.polarity)
            subjectivity.append(sentiment.subjectivity)
        return polarity, subjectivity

    def find_created_time(self) -> list:
        """Return ``tweet['created_at']`` for every tweet, unmodified."""
        # the value is collected as-is; no parsing or type conversion happens here
        return [tweet['created_at'] for tweet in self.tweets_list]

    def find_source(self) -> list:
        """Return ``tweet['source']`` for every tweet."""
        return [tweet['source'] for tweet in self.tweets_list]

    def is_sensitive(self) -> list:
        """Return ``tweet['possibly_sensitive']`` per tweet, or None when the key is missing."""
        return [tweet.get('possibly_sensitive') for tweet in self.tweets_list]

    def find_location(self) -> list:
        """Return ``tweet['user']['location']`` per tweet, or None when the key is missing."""
        return [tweet['user'].get('location') for tweet in self.tweets_list]

    def get_tweet_df(self, save=False) -> pd.DataFrame:
        """
        Assemble the extracted fields into a dataframe.

        Args:
        -----
        save: bool - when True, write the dataframe to 'processed_tweet_data.csv'
              and its first 10 rows to 'data/sample.json', then print a
              confirmation message

        Returns
        -------
        dataframe with the columns created_at, source, original_text, polarity,
        subjectivity, sensitivity and location (one row per tweet)
        """
        # required column to be generated you should be creative and add more features
        # TODO: not extracted yet - lang, favorite_count, retweet_count, original_author,
        #       followers_count, friends_count, hashtags, user_mentions, place
        columns = ['created_at', 'source', 'original_text', 'polarity', 'subjectivity',
                   'sensitivity', 'location']

        created_at = self.find_created_time()
        source = self.find_source()
        text = self.find_full_text()
        polarity, subjectivity = self.find_sentiments(text)
        sensitivity = self.is_sensitive()
        location = self.find_location()

        # zip() turns the per-column lists into one tuple per tweet (row)
        data = list(zip(created_at, source, text, polarity, subjectivity,
                        sensitivity, location))
        df = pd.DataFrame(data=data, columns=columns)

        if save:
            import os

            df.to_csv('processed_tweet_data.csv', index=False)
            # to_json does not create missing directories, so make sure the
            # target folder exists before writing the sample.
            os.makedirs('data', exist_ok=True)
            df.head(10).to_json('data/sample.json')
            print('Files Successfully Saved.!!!')

        return df

                
if __name__ == "__main__":
    # required column to be generated you should be creative and add more features
    # NOTE(review): this list is not referenced anywhere else in this block, and it
    # names columns (e.g. 'clean_text', 'hashtags', 'place') that get_tweet_df()
    # does not include in its own column list.
    columns = ['created_at', 'source', 'original_text','clean_text', 'sentiment','polarity',
                'subjectivity', 'lang', 'favorite_count', 'retweet_count', 'original_author', 
                'screen_count', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 
                'user_mentions', 'place', 'place_coord_boundaries']
    # read_json returns (count, list); the count is discarded here.
    # NOTE(review): the path is relative, so it resolves against the current
    # working directory of the kernel/process.
    _, tweet_list = read_json("Desktop/TwitterDataAnalysis/data/global_twitter_data.json")


    # Build the dataframe (save defaults to False, so no files are written)
    # and print its first rows as a preview.
    tweet = TweetDfExtractor(tweet_list)
    tweet_df = tweet.get_tweet_df() 
    print(tweet_df.head())
