In [1]:
#import dependencies

import tweepy
import config
import re
import pandas as pd

In [2]:
# Authenticate with the consumer key/secret read from the local `config` module.
twitter_auth = tweepy.OAuthHandler(config.consumer_key, config.consumer_key_secret)
api = tweepy.API(twitter_auth)

# The standard search endpoint returns at most 100 tweets per request, so a
# single call with count=250 is capped well below the requested amount.
# Page through the results with a Cursor to collect up to 250 tweets.
tweets = list(
    tweepy.Cursor(api.search, q="data", lang='en', count=100).items(250)
) # Search term goes here

print(tweets)

[Status(_api=<tweepy.api.API object at 0x000001E0B4AA08C8>, _json={'created_at': 'Sat Jul 18 00:25:46 +0000 2020', 'id': 1284283183253671943, 'id_str': '1284283183253671943', 'text': 'RT @andrewbostom: CDC created the category in the Table which combines flu/pneumonia death bec of their belief flu leading to subsequent ba…', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'andrewbostom', 'name': 'Andrew Bostom', 'id': 1465875630, 'id_str': '1465875630', 'indices': [3, 16]}], 'urls': []}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 30266529, 'id_str': '30266529', 'name': 'Recovering_Democrat', 'screen_name': 'InAbsentia9', 'location': 'Tatooine...',

In [None]:
#put tweets into a dataframe

# Every field extracted per tweet, in output column order. Passing this list to
# the constructor guarantees the columns exist even when `tweets` is empty.
tweet_columns = ['user_id', 'user', 'tweet_text', 'tweet_date',
                 'user_location',
                 'user_followers_count', 'user_friends_count',
                 'user_account_age', 'user_verified',
                 'user_favourites_count', 'user_tweets',
                 'tweet_retweeted', 'tweet_retweet_count', 'tweet_favorite_count']

# Collect one record per tweet and build the frame in a single call.
# Growing a DataFrame row by row with DataFrame.append copies the whole frame
# on every iteration, and the method no longer exists in pandas 2.0.
tweet_records = [{'user_id': tweet.user.id_str,
                  'user': tweet.user.screen_name,
                  'tweet_text': tweet.text,
                  'tweet_date': tweet.created_at,
                  'user_location': tweet.user.location,
                  'user_followers_count': tweet.user.followers_count,
                  'user_friends_count': tweet.user.friends_count,
                  'user_account_age': tweet.user.created_at,
                  'user_verified': tweet.user.verified,
                  'user_favourites_count': tweet.user.favourites_count,
                  'user_tweets': tweet.user.statuses_count,
                  'tweet_retweeted': tweet.retweeted,
                  'tweet_retweet_count': tweet.retweet_count,
                  'tweet_favorite_count': tweet.favorite_count}
                 for tweet in tweets]

df = pd.DataFrame(tweet_records, columns=tweet_columns)

# Remove duplicates
# This has to run after the rows exist; on an empty frame it does nothing.
# keep="first" retains one copy of each repeated text (keep=False would throw
# away every copy). The index is reset so rows stay labelled 0..n-1, which the
# later positional loops rely on.
df = df.drop_duplicates(subset="tweet_text", keep="first").reset_index(drop=True)

df.head()

In [None]:
# Keep only the identifying fields and the raw tweet text.
# .copy() makes this an independent frame, so adding columns to it in later
# cells does not raise SettingWithCopyWarning.
df = df[['user_id', 'tweet_date', 'user', 'tweet_text']].copy()
df.head()

In [None]:
#take out urls

# Raw string so \S reaches the regex engine unchanged; the dot after "www" is
# escaped so it only matches a literal ".". regex=True is spelled out because
# pandas 2.0 treats the pattern as literal text by default, and `case=` is
# only accepted for regex replacements.
df['tidy_tweet'] = df['tweet_text'].str.replace(r'http\S+|www\.\S+', '', case=False, regex=True)
df.head()

In [None]:
#Remove handles
import re
import numpy as np

def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the fragments to delete.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches of ``pattern`` replaced
        by the empty string.
    """
    # Substitute with the pattern itself rather than re-using each matched
    # string as a new regex: matched text may contain regex metacharacters,
    # and a short match (e.g. "@bob") would otherwise also eat the start of a
    # longer one (e.g. "@bobby").
    return re.sub(pattern, '', input_txt)

# Strip @handles from every tweet. Series.apply is used instead of
# np.vectorize because vectorize raises on an empty column, and the pattern is
# a raw string so \w is not read as a string escape.
df['tidy_tweet'] = df['tidy_tweet'].apply(lambda text: remove_pattern(text, r"@[\w]*"))

df.head()

In [None]:
#Remove special characters, number, and punctuation

# Every character that is not an ASCII letter or '#' becomes a space.
# regex=True is required: pandas 2.0 treats the pattern as literal text by
# default, in which case nothing would be replaced.
df['tidy_tweet'] = df['tidy_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

In [None]:
#Remove short words

# Keep only words longer than three characters and re-join them with single spaces.
df['tidy_tweet'] = df['tidy_tweet'].apply(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3)
)

In [None]:
#Tokenization, breaking up the tweets into single words

# Split each cleaned tweet on whitespace, giving one list of words per row.
tokenized_tweet = df['tidy_tweet'].apply(str.split)
tokenized_tweet

In [None]:
#Stemming, reducing each tokenized word to its root (stem) form
#Reference: https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

# Import only the class that is used; a wildcard import would pull every
# public name of the module into the notebook namespace.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# Replace every token list with the list of its stemmed tokens.
tokenized_tweet = tokenized_tweet.apply(lambda tokens: [stemmer.stem(token) for token in tokens])

In [None]:
# Join each row's stemmed tokens back into one space-separated string.
# apply() works row by row regardless of the index labels, whereas
# tokenized_tweet[i] with a positional counter only works on a 0..n-1 index.
# Rows that are already strings are left alone, so re-running this cell does
# not split previously joined text into single characters.
tokenized_tweet = tokenized_tweet.apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

df['stemmed_tweet'] = tokenized_tweet
df.head()

In [None]:
# Spread the words of each cleaned tweet across columns, one word per column.
df2 = df['tidy_tweet'].str.split(expand=True)
df2.head()

In [None]:
# Put the per-word columns next to the tweet columns, aligned on the row index.
test_final = pd.concat((df, df2), axis="columns")
test_final.head()

In [None]:
# Show the (rows, columns) size of the combined frame.
test_final.shape

In [None]:
#write results to a csv
# The utf-8-sig codec writes a byte-order mark at the start of the file;
# index=False leaves the row index out of the output.
test_final.to_csv("testing_data.csv", encoding='utf-8-sig', index=False)