In [1]:
!pip install prophet

Collecting prophet
  Downloading prophet-1.1.4-py3-none-win_amd64.whl (12.9 MB)
Collecting cmdstanpy>=1.0.4
  Downloading cmdstanpy-1.1.0-py3-none-any.whl (83 kB)
Collecting LunarCalendar>=0.0.9
  Downloading LunarCalendar-0.0.9-py2.py3-none-any.whl (18 kB)
Collecting holidays>=0.25
  Downloading holidays-0.29-py3-none-any.whl (695 kB)
Collecting convertdate>=2.1.2
  Using cached convertdate-2.4.0-py3-none-any.whl (47 kB)
Collecting importlib-resources
  Downloading importlib_resources-6.0.0-py3-none-any.whl (31 kB)
Collecting pymeeus<=1,>=0.3.13
  Downloading PyMeeus-0.5.12.tar.gz (5.8 MB)
Collecting ephem>=3.7.5.3
  Downloading ephem-4.1.4-cp39-cp39-win_amd64.whl (1.4 MB)
Building wheels for collected packages: pymeeus
  Building wheel for pymeeus (setup.py): started
  Building wheel for pymeeus (setup.py): finished with status 'done'
  Created wheel for pymeeus: filename=PyMeeus-0.5.12-py3-none-any.whl size=4570 sha256=5ead904f89e8faed81a0135d50e5f84f0b69306182e521da63562c7d56a84375

In [2]:
# Analysis Tools
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import norm

# Plotting Tools
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Extra Plotting Tools Required for Bar Chart Race
import matplotlib.ticker as ticker
import matplotlib.animation as animation
from IPython.display import HTML


import datetime
import calplot
 
# Prophet model 
from prophet import Prophet
from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics
from prophet.plot import plot_plotly, plot_components_plotly, plot_cross_validation_metric
from prophet.serialize import model_to_json, model_from_json




# Plot Design Settings
# Seaborn theme: dark grid drawn on a light blue-grey canvas, blue palette.
grid_theme = {
    "axes.facecolor": "#eff2f5",
    'grid.color': '#c0ccd8',
    'patch.edgecolor': '#B0B0B0',
    'font.sans-serif': 'Verdana',
}
sns.set_style("darkgrid", grid_theme)
sns.set_palette('Blues_d')

# Matplotlib font sizes, grouped by the rc section they belong to.
plt.rc('font', size=19)
plt.rc('axes', titlesize=25, labelsize=20)
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=17)
plt.rc('figure', titlesize=24)


# Mute warnings
import warnings
warnings.filterwarnings('ignore')



In [3]:
# Load the city temperature CSV (path is relative to the working directory)
# and show its (rows, columns) shape.
data = pd.read_csv('city_temperature.csv')
data.shape

(2906327, 8)

In [4]:
# Display the frame; pandas truncates the rendering to the first and last rows.
data


Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
0,Africa,Algeria,,Algiers,1,1,1995,64.2
1,Africa,Algeria,,Algiers,1,2,1995,49.4
2,Africa,Algeria,,Algiers,1,3,1995,48.8
3,Africa,Algeria,,Algiers,1,4,1995,46.4
4,Africa,Algeria,,Algiers,1,5,1995,47.9
...,...,...,...,...,...,...,...,...
2906322,North America,US,Additional Territories,San Juan Puerto Rico,7,27,2013,82.4
2906323,North America,US,Additional Territories,San Juan Puerto Rico,7,28,2013,81.6
2906324,North America,US,Additional Territories,San Juan Puerto Rico,7,29,2013,84.2
2906325,North America,US,Additional Territories,San Juan Puerto Rico,7,30,2013,83.8


In [5]:
# Show the first five rows to check the column layout.
data.head()

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
0,Africa,Algeria,,Algiers,1,1,1995,64.2
1,Africa,Algeria,,Algiers,1,2,1995,49.4
2,Africa,Algeria,,Algiers,1,3,1995,48.8
3,Africa,Algeria,,Algiers,1,4,1995,46.4
4,Africa,Algeria,,Algiers,1,5,1995,47.9


Feature dtypes :

In [6]:
# Summarise the column dtypes and the memory footprint of the frame.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2906327 entries, 0 to 2906326
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Region          object 
 1   Country         object 
 2   State           object 
 3   City            object 
 4   Month           int64  
 5   Day             int64  
 6   Year            int64  
 7   AvgTemperature  float64
dtypes: float64(1), int64(3), object(4)
memory usage: 177.4+ MB


In [7]:
# Summary statistics for the temperature column.
# NOTE(review): the reported minimum of -99 looks like a missing-value sentinel
# rather than a real reading — confirm against the dataset documentation.
data['AvgTemperature'].describe()


count    2.906327e+06
mean     5.600492e+01
std      3.212359e+01
min     -9.900000e+01
25%      4.580000e+01
50%      6.250000e+01
75%      7.550000e+01
max      1.100000e+02
Name: AvgTemperature, dtype: float64

Unique Values per Feature :

In [8]:
# Count the distinct values held by every column
for column, n_unique in data.nunique().items():
    print(column, ': ', n_unique)

Region :  7
Country :  125
State :  52
City :  321
Month :  12
Day :  32
Year :  28
AvgTemperature :  1517


Missing Values :

In [9]:
# Count the missing entries in every column
for column, n_missing in data.isnull().sum().items():
    print(column, ': ', n_missing)

Region :  0
Country :  0
State :  1450990
City :  0
Month :  0
Day :  0
Year :  0
AvgTemperature :  0


In [10]:
# 'State' is missing for roughly half of the rows, so the column is dropped.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
data = data.drop(columns=['State'], errors='ignore')

Duplicates :

In [11]:
# duplicated() flags every repeat of an earlier row; summing the flags counts them.
n_duplicates = data.duplicated().sum()
print('Number of Duplicates: {}'.format(n_duplicates))


Number of Duplicates: 20903


In [12]:
# Keep the first occurrence of each row and discard its later repeats.
data = data[~data.duplicated()]



Number of 'AvgTemperature' values for each year :

In [13]:
# Number of temperature readings recorded in each year.
data['AvgTemperature'].groupby(data['Year']).count()


Year
200         60
201        235
1995    117880
1996    118210
1997    117921
1998    118334
1999    118616
2000    118946
2001    118618
2002    118601
2003    118405
2004    117910
2005    117158
2006    116910
2007    114677
2008    113818
2009    113508
2010    114526
2011    112308
2012    111256
2013    110284
2014    108284
2015    106763
2016    106499
2017    106209
2018    105657
2019    105290
2020     38541
Name: AvgTemperature, dtype: int64

In [14]:
# Keep 1995-2019 only: this removes the malformed years 200 and 201 as well as
# 2020, which has far fewer rows than any other year from 1995 onwards.
data = data[(data['Year'] > 1994) & (data['Year'] < 2020)]


Range of Days :

In [15]:
# Report the smallest and largest day-of-month value.
# The aggregations are passed by name: handing the builtins min/max to agg()
# is deprecated in pandas >= 2.1 and relies on an internal alias to Series.min/max.
print('Day Range Min to Max: \n{}'.format(data['Day'].agg(['min', 'max'])))


Day Range Min to Max: 
min     0
max    31
Name: Day, dtype: int64


In [16]:
# Day 0 is not a valid day of the month, so keep only rows with Day >= 1,
# then confirm the new minimum. 'min' is passed by name because giving the
# builtin min to agg() is deprecated in pandas >= 2.1.
data = data[data['Day']>0]
print('1st Day: \n{}'.format(data['Day'].agg(['min'])))

1st Day: 
min    1
Name: Day, dtype: int64


Dropping all incomplete years :