In [1]:
import re
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

In [2]:
# Read the tweet dump into a DataFrame.
# The CSV is expected to sit in the same folder as this notebook.
tweets_csv = "./dog_rates_tweets.csv"
data = pd.read_csv(tweets_csv)

In [3]:
# Find tweets that contain an 'n/10' rating (because not all do)
def get_rating(tweet):
    """Extract the first 'n/10' rating found in a tweet's text.

    Parameters
    ----------
    tweet : str
        Text of the tweet.

    Returns
    -------
    float or str
        The numerator of the first 'n/10' occurrence (integer or decimal
        such as '9.75/10') converted to float, or the string "None" when the
        text contains no such pattern. Callers filter on this exact sentinel,
        so it is kept as a string rather than the None object.
    """
    # Raw string literal: in a normal string '\d' and '\.' are invalid escape
    # sequences and raise a warning on current Python versions.
    # re.search stops at the first match, which is all that is needed here.
    match = re.search(r'(\d+(\.\d+)?)/10', tweet)
    if match is None:
        return "None"
    return float(match.group(1))
    
# Pull the numeric rating out of every tweet's text
data['rating'] = data['text'].map(get_rating)

# Keep only rated tweets: get_rating marks unrated ones with the string "None"
unrated_rows = data.loc[data['rating'] == "None"].index
data.drop(unrated_rows, inplace=True)

In [4]:
# Remove outliers: a few rating values are obviously too large to make sense.
# Every row rated above 25 (i.e. more than 25/10) is dropped.
too_large = data['rating'] > 25
data.drop(data.loc[too_large].index, inplace=True)

In [5]:
# 'created_at' must hold datetime values rather than strings.
# Converting the column here is one option; the other is to let pd.read_csv
# parse it up front through its parse_dates argument.
created = pd.to_datetime(data['created_at'])
data['created_at'] = created

In [6]:
# scipy.stats.linregress function can do a linear regression for us
# But it works on numbers, not datetime objects
# So create a timestamp column
def to_timestamp(datetime):
    """Return the numeric timestamp of a datetime-like value.

    Parameters
    ----------
    datetime : object with a ``timestamp()`` method
        The value to convert (the notebook applies this to each entry of
        the 'created_at' column).

    Returns
    -------
    Whatever ``datetime.timestamp()`` returns, unchanged.
    """
    return datetime.timestamp()
data['timestamp'] = data['created_at'].apply(to_timestamp)

# Least-squares best-fit line: rating as a linear function of the timestamp.
# linregress needs plain numbers, so the ratings are cast to a float array.
x = data['timestamp']
y = data['rating'].values.astype(float)
fit = stats.linregress(x, y)

In [7]:
# Scatter plot of date vs rating (blue dots), with the best-fit line drawn
# on top (red), so you can see what the data looks like
dates = data['created_at'].values
fitted = data['timestamp'] * fit.slope + fit.intercept

plt.xticks(rotation=25)
plt.plot(dates, y, 'b.', alpha=0.5)
plt.plot(dates, fitted, 'r-', linewidth=3)

[<matplotlib.lines.Line2D at 0x7f4def412898>]

In [8]:
# At the end of the notebook ...
# Show the data itself: only rated tweets with ratings of at most 25 remain,
# and the frame now carries the derived 'rating' and 'timestamp' columns
data

Unnamed: 0,id,created_at,text,rating,timestamp
7,905222050297643008,2017-09-06 00:12:09,b'Here is a doggo before and after being prese...,13,1.504682e+09
11,905175402502660096,2017-09-05 21:06:47,b'RUPERT OMG I LOVE YOU TOO DO NOT BE DISCOURA...,12,1.504671e+09
19,905098956430086144,2017-09-05 16:03:01,b'Say hello to Rush. He discovered a secret sn...,12,1.504653e+09
22,905079268476145665,2017-09-05 14:44:47,b'THIS \xf0\x9f\x91\x8f IS \xf0\x9f\x91\x8f WH...,13,1.504648e+09
33,904495094014861312,2017-09-04 00:03:29,"b""This is Lucy. She's trying to learn how to s...",12,1.504509e+09
38,904363433650515968,2017-09-03 15:20:19,b'RT @GadGooner: 13/10 would park on this leve...,13,1.504477e+09
45,904128876116410369,2017-09-02 23:48:16,"b""This is Sawyer. He discovered a way to make ...",13,1.504421e+09
61,903288181222772736,2017-08-31 16:07:39,"b""This is Bentley. He's very puptective of his...",13,1.504221e+09
66,903047250515025922,2017-08-31 00:10:16,b'Meet Lily. This is where she goes when the s...,13,1.504163e+09
71,902923889638072321,2017-08-30 16:00:05,b'This is Misha. She waves goodbye to her owne...,14,1.504134e+09


In [None]:
# ... the slope and intercept of the best-fit line ...
# (both are read from the stats.linregress result and displayed as a tuple)
fit.slope, fit.intercept

In [None]:
# ... and a scatterplot with fit line
# NOTE(review): no plotting happens in this cell, so plt.show() depends on the
# figure built in the earlier plotting cell still being pyplot's current figure.
# Presumably an inline backend has already rendered it there; confirm this cell
# still displays the plot after Restart & Run All.
plt.show()