## Explore tweets about Rafael Nadal

### Introduction

The dataset we are using here is collected using Twitter API, tweepy and python package. The hashtag used to filter the tweet is #rafaelnadal.

### Data Preparation

#### Load Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
import warnings
warnings.simplefilter('ignore')

In [2]:
tweets_df = pd.read_csv('/content/drive/MyDrive/Machine Learning Projects/Rafael Nadal Tweets/rafaelnadal_tweets.csv')

### Data Exploration

#### Glimpse the data

In [4]:
print(f'data shape: {tweets_df.shape}')

data shape: (8759, 13)


In [6]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8759 entries, 0 to 8758
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_name         8759 non-null   object
 1   user_location     6204 non-null   object
 2   user_description  7823 non-null   object
 3   user_created      8759 non-null   object
 4   user_followers    8759 non-null   int64 
 5   user_friends      8759 non-null   int64 
 6   user_favourites   8759 non-null   int64 
 7   user_verified     8759 non-null   bool  
 8   date              8759 non-null   object
 9   text              8759 non-null   object
 10  hashtags          6987 non-null   object
 11  source            8759 non-null   object
 12  is_retweet        8759 non-null   bool  
dtypes: bool(2), int64(3), object(8)
memory usage: 770.0+ KB


In [7]:
tweets_df.describe()

Unnamed: 0,user_followers,user_friends,user_favourites
count,8759.0,8759.0,8759.0
mean,9883.7,1002.61354,24823.08
std,238158.8,3824.312986,49799.04
min,0.0,0.0,0.0
25%,65.0,164.0,1258.5
50%,249.0,408.0,7047.0
75%,1052.0,985.5,27943.0
max,14444030.0,191548.0,1083727.0


In [8]:
tweets_df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,Nong Nhat Minh,,,2021-11-26 07:07:47,20,890,1035,False,2022-06-08 17:02:44,@DappCensus Nice project. @linhair8 @LongAirdr...,"['dappcensus', 'Airdrop', 'BNB', 'giveaway', '...",Twitter Web App,False
1,Peter Ndoro,Africa,Broadcast Journalist | This is not a News Feed...,2009-03-22 16:29:58,279853,191548,5787,True,2022-06-08 16:52:04,The champions are being born everyday. They ar...,,Twitter for iPhone,False
2,Gurpreet Singh,Mansa,https://t.co/2zAmCdu2Jh,2019-05-17 16:33:12,61,1214,2727,False,2022-06-08 16:43:24,@DappCensus 🤩\n Successful in 2022\nBig profit...,,Twitter for Android,False
3,💯 Earning Tips💰💰,"Dhaka, Bangladesh",ARKERARMY💪,2020-08-28 08:56:58,115,2195,3716,False,2022-06-08 16:39:26,@DappCensus This is very huge and great projec...,,Twitter for Android,False
4,ahs,universe,a common man.,2012-06-08 09:23:24,35,393,21,False,2022-06-08 16:35:21,@neeteshb @RajKumarMUFC @87vintage @nadalprop ...,['Djokovic'],Twitter for iPhone,False


#### Missing data

In [14]:
def missing_data(data):
  total = data.isnull().sum()
  percent = (data.isnull().sum()/data.isnull().count()*100)
  tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
  types = []
  for col in data.columns:
    dtype = str(data[col].dtype)
    types.append(dtype)
  tt['Types'] = types
  print(tt)
  return (np.transpose(tt))

In [15]:
missing_data(tweets_df)

                  Total    Percent   Types
user_name             0   0.000000  object
user_location      2555  29.169997  object
user_description    936  10.686151  object
user_created          0   0.000000  object
user_followers        0   0.000000   int64
user_friends          0   0.000000   int64
user_favourites       0   0.000000   int64
user_verified         0   0.000000    bool
date                  0   0.000000  object
text                  0   0.000000  object
hashtags           1772  20.230620  object
source                0   0.000000  object
is_retweet            0   0.000000    bool


Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
Total,0,2555,936,0,0,0,0,0,0,0,1772,0,0
Percent,0.0,29.169997,10.686151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.23062,0.0,0.0
Types,object,object,object,object,int64,int64,int64,bool,object,object,object,object,bool


#### Unique values

In [19]:
# Exprimentation
# total = tweets_df.count()
# tt = pd.DataFrame(total)
# tt.columns = ['Total']
# tt['Total'].nunique()

4

In [22]:
def unique_values(data):
  total = data.count()
  tt = pd.DataFrame(total)
  print(tt)
  tt.columns = ['Total']
  uniques = []
  for col in data.columns:
    unique = data[col].nunique()
    uniques.append(unique)
  tt['Uniques'] = uniques
  print(tt)
  return (np.transpose(tt))

In [23]:
unique_values(tweets_df)

                     0
user_name         8759
user_location     6204
user_description  7823
user_created      8759
user_followers    8759
user_friends      8759
user_favourites   8759
user_verified     8759
date              8759
text              8759
hashtags          6987
source            8759
is_retweet        8759
                  Total  Uniques
user_name          8759     5568
user_location      6204     2217
user_description   7823     5033
user_created       8759     5633
user_followers     8759     1954
user_friends       8759     1959
user_favourites    8759     4635
user_verified      8759        2
date               8759     7693
text               8759     8701
hashtags           6987     1906
source             8759       19
is_retweet         8759        1


Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
Total,8759,6204,7823,8759,8759,8759,8759,8759,8759,8759,6987,8759,8759
Uniques,5568,2217,5033,5633,1954,1959,4635,2,7693,8701,1906,19,1
