In [1]:
import tweepy
import config

In [2]:
twitter_auth = tweepy.OAuthHandler(config.consumer_key, config.consumer_key_secret)
api = tweepy.API(twitter_auth)

tweets = api.search("data", lang='en', count=50) # Search term goes here

print(tweets)

[Status(_api=<tweepy.api.API object at 0x000002A5D59FBA48>, _json={'created_at': 'Sun Jul 12 20:34:40 +0000 2020', 'id': 1282413088181026818, 'id_str': '1282413088181026818', 'text': 'RT @ASlavitt: BREAKING: This chart. Devastating. \n\nDo everything you can to prevent the spread.\n\nFrom \u2066@nytimes\u2069 who sued @cdc to get this…', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'ASlavitt', 'name': 'Andy Slavitt @ 🏡', 'id': 1383272101, 'id_str': '1383272101', 'indices': [3, 12]}, {'screen_name': 'nytimes', 'name': 'The New York Times', 'id': 807095, 'id_str': '807095', 'indices': [103, 111]}, {'screen_name': 'cdc', 'name': 'David', 'id': 1532281, 'id_str': '1532281', 'indices': [122, 126]}], 'urls': []}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_

In [3]:

import re
import pandas as pd


df = pd.DataFrame(columns=('tweet_text', 'user_location', 
                           'user_followers_count', 'user_friends_count',
                           'user_account_age', 'user_verified',
                           'user_favourites_count', 'user_tweets',
                           'tweet_retweeted', 'tweet_retweet_count', 'tweet_favorite_count'))

# Remove duplicates
df.sort_values("tweet_text", inplace = True) 
df.drop_duplicates(subset ="tweet_text", keep = False, inplace = True) 

for tweet in tweets:

    df = df.append({'user_id': tweet.user.id_str,
                    'user': tweet.user.screen_name,
                    'tweet_text': tweet.text,
                    'tweet_date': tweet.created_at,
                    'user_location': tweet.user.location,
                    'user_followers_count': tweet.user.followers_count, 
                    'user_friends_count': tweet.user.friends_count,
                    'user_account_age': tweet.user.created_at, 
                    'user_verified': tweet.user.verified,
                    'user_favourites_count': tweet.user.favourites_count,
                    'user_tweets': tweet.user.statuses_count,
                    'tweet_retweeted': tweet.retweeted,
                    'tweet_retweet_count': tweet.retweet_count,
                    'tweet_favorite_count': tweet.favorite_count},
                   ignore_index=True)

df.head()

Unnamed: 0,tweet_text,user_location,user_followers_count,user_friends_count,user_account_age,user_verified,user_favourites_count,user_tweets,tweet_retweeted,tweet_retweet_count,tweet_favorite_count,tweet_date,user,user_id
0,RT @ASlavitt: BREAKING: This chart. Devastatin...,"Ottawa, Ontario",107,332,2009-05-28 02:53:51,False,24667,15734,False,139,0,2020-07-12 20:34:40,ronanpaula,43032126
1,RT @harvardmed: The COVID-19 Mobility Data Net...,"Brooklyn, NY",12075,8660,2012-02-02 01:05:50,False,644473,695934,False,5,0,2020-07-12 20:34:40,Rosenchild,480875170
2,@jbf1755 Weekend data is often unreliable.,"Winnetka, Los Angeles",333,783,2009-02-26 15:39:46,False,4549,21300,False,0,0,2020-07-12 20:34:40,esstheman,22022051
3,RT @ASlavitt: BREAKING: This chart. Devastatin...,,3077,231,2009-12-27 20:34:43,False,15460,128333,False,139,0,2020-07-12 20:34:40,itsKaitbh,99777748
4,RT @ASlavitt: EDITORIAL | The Star Tribune E...,New York,598,1060,2013-11-24 17:43:45,False,24528,286773,False,3,0,2020-07-12 20:34:39,MelodyPuppyDog,2194394673


In [None]:
df=df[['user_id', 'tweet_date', 'user', 'tweet_text']]
df.head()

In [None]:
#take our urls

df['tidy_tweet'] = df['tweet_text'].str.replace('http\S+|www.\S+', '', case=False)
df.head()

In [None]:
#Remove handles
import re
import numpy as np

def remove_pattern(input_txt, pattern):
  r = re.findall(pattern, input_txt)
  for i in r:
    input_txt = re.sub(i, '', input_txt)

  return input_txt

df['tidy_tweet'] = np.vectorize(remove_pattern)(df['tidy_tweet'], "@[\w]*")

df.head()

In [None]:
#Remove special characters, number, and punctuation

df['tidy_tweet'] = df['tidy_tweet'].str.replace("[^a-zA-Z#]", " " )

In [None]:
#Remove short words

df['tidy_tweet'] = df['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [None]:
#Tokenization, breaking up the tweets into single words

tokenized_tweet = df['tidy_tweet'].apply(lambda x: x.split())
tokenized_tweet

In [None]:
#Stemming, converting each tokenized word into its root form and tense
#Reference: https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

from nltk.stem.porter import *
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x])

In [None]:
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
    
df['stemmed_tweet'] = tokenized_tweet
df.head()

In [None]:
df2 = df.tidy_tweet.str.split(expand=True)
df2.head()

In [None]:
test_final = pd.concat([df, df2], axis=1)
test_final.head()

In [None]:
test_final.shape

In [None]:
#write results to a csv
test_final.to_csv("testing_data.csv", encoding='utf-8-sig', index=False)