# Chapter 10.16 Time Series LR

In [15]:
import pandas as pd
from scipy import stats
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

ModuleNotFoundError: No module named 'scipy'

In [16]:
nyc = pd.read_csv('1895-2023.csv')

In [17]:
c = lambda f:5/9 * (f-32)
temps = [(f, c(f)) for f in range (0, 101, 10)]

In [18]:
temps_df = pd.DataFrame(temps, columns=['Fahrenheit', 'Celsius'])
axes = temps_df.plot(x='Fahrenheit', y='Celsius', style='.-')
y_label = axes.set_ylabel('Celsius')

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

## Loading average high temperatures into a DataFrame

In [19]:
nyc.head()

Unnamed: 0,Date,Temperature,Anomaly
0,189501,29.6,-4.2
1,189601,28.4,-5.4
2,189701,29.2,-4.6
3,189801,33.8,0.0
4,189901,29.8,-4.0


In [20]:
nyc.tail()

Unnamed: 0,Date,Temperature,Anomaly
124,201901,32.6,-1.2
125,202001,39.2,5.4
126,202101,34.8,1.0
127,202201,30.4,-3.4
128,202301,43.5,9.7


## Cleaning the Data

In [21]:
nyc.column = ['Date', 'Temperature', 'Anomaly']
nyc.head(3)

  nyc.column = ['Date', 'Temperature', 'Anomaly']


Unnamed: 0,Date,Temperature,Anomaly
0,189501,29.6,-4.2
1,189601,28.4,-5.4
2,189701,29.2,-4.6


Note that padas doesn't allow for columns to be created by new attributes. So in this case we can either modify the header in the file, or make a note of the data. 

In [22]:
nyc.Date.dtype


dtype('int64')

In [23]:
nyc.Date = nyc.Date.floordiv(100)
nyc.head(3)

Unnamed: 0,Date,Temperature,Anomaly
0,1895,29.6,-4.2
1,1896,28.4,-5.4
2,1897,29.2,-4.6


## Basic Descriptive Statistics for the Dataset

In [24]:
pd.set_option('display.precision', 2)

In [25]:
nyc.Temperature.describe()

count    129.00
mean      31.77
std        4.60
min       20.80
25%       28.90
50%       31.60
75%       34.60
max       43.50
Name: Temperature, dtype: float64

In [26]:
linear_regression = stats.linregress(x=nyc.Date, 
                                    y=nyc.Temperature)

NameError: name 'stats' is not defined

In [None]:
linear_regression.slope

0.029894342576028618

In [None]:
linear_regression.intercept

-26.79790082737029

In [None]:
linear_regression.slope * 2026 + linear_regression.intercept

33.76803723166369

In [None]:
linear_regression.slope * 1890 + linear_regression.intercept

29.702406641323797

## Creating a Plot:

In [None]:
sns.set_style('whitegrid')
axes = sns.regplot(x=nyc.Date, y=nyc.Temperature)
axes.set_ylim(10, 70)
(10, 70)

(10, 70)

## Splitting the Data for Training and Testing:

In [None]:
X_train, X_test, y_train, y_test = train_test_split(nyc.Date.values.reshape(-1, 1), nyc.Temperature.values, random_state=11)

In [None]:
X_train.shape

(96, 1)

In [None]:
X_test.shape

(33, 1)

## Training the Model

In [None]:
linear_regression = LinearRegression()


In [None]:
linear_regression.fit(X=X_train, y=y_train)

LinearRegression()

In [None]:
linear_regression.coef_

array([0.02654701])

In [None]:
linear_regression.intercept_

-20.50598599894774

## Testing the Model

In [None]:
predicted = linear_regression.predict(X_test)
expected = y_test
for p, e in zip(predicted[::5], expected[::5]):
    print(f'predicted: {p:.2f}, expected: {e:.2f}')

predicted: 29.80, expected: 29.60
predicted: 31.53, expected: 32.80
predicted: 31.84, expected: 34.20
predicted: 32.72, expected: 31.70
predicted: 32.80, expected: 36.90
predicted: 32.56, expected: 34.30
predicted: 30.38, expected: 31.90


## Predicting Future Temperatures:

In [None]:
predict = (lambda x: linear_regression.coef_ * x +
          linear_regression.intercept_)

In [None]:
predict(2026)

array([33.27825449])

In [None]:
predict(1890)

array([29.66786125])

## Visualizing the Dataset

In [None]:
axes = sns.scatterplot(data=nyc, x='Date', y='Temperature', hue='Temperature', palette='winter', legend=False)
axes.set_ylim(10, 70)
x = np.array([min(nyc.Date.values), max(nyc.Date.values)])
y = predict(x)
line = plt.plot(x, y)