# High-level introduction to <code>pandas</code>
For a more complete introduction to <code>pandas</code>, see [https://pandas.pydata.org/](https://pandas.pydata.org/).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Series
Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.).

In [None]:
# Year labels 1990..2009 and one standard-normal draw per label.
index = range(1990, 2010)
data = np.random.randn(len(index))

In [None]:
# Inspect the raw values and the labels before combining them.
print(data)
print(index)

In [None]:
# Attach the year labels to the random values.
y = pd.Series(data=data, index=index)

In [None]:
# Each value is shown next to its index label.
print(y)

In [None]:
# Monthly salaries keyed by name; None marks a missing value.
salaries = {
    'gino': 1500,
    'maria': 2560.34,
    'luca': None,
    'federico': 2451,
}

In [None]:
# A dict becomes a Series whose index is the dict keys.
s = pd.Series(data=salaries)

In [None]:
# Show the salaries together with their labels.
print(s)

In [None]:
# A Series can hold non-numeric values too; 'b' has no value.
k = pd.Series({'a': 'v', 'b': None})

In [None]:
# Show the two entries of the string-valued Series.
print(k)

### Access series as arrays

In [None]:
# Slicing: the first two entries.
print(s[:2], '\n')
# Boolean mask: entries above the median.
print(s[s > s.median()], '\n')
# NumPy functions apply element-wise and keep the index.
print(np.log(s), '\n')
# Arithmetic between Series and with scalars.
print(s + s, '\n')
print(s * 3, '\n')
# The operands are aligned on their labels before being added.
print(y[4:8] + y[4:10])

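### Access series as dictionaries

A Series also behaves like a dictionary keyed by its index labels: `s['maria']` returns the value stored under that label, `'luca' in s` tests whether a label exists, and `s.get('anna', 0)` returns a default when the label is missing.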

# Data Frames
From [http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe](http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe)

DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects. It is generally the most commonly used pandas object. Like Series, DataFrame accepts many different kinds of input:

- Dict of 1D ndarrays, lists, dicts, or Series
- 2-D numpy.ndarray
- Structured or record ndarray
- A Series
- Another DataFrame

Along with the data, you can optionally pass index (row labels) and columns (column labels) arguments. If you pass an index and / or columns, you are guaranteeing the index and / or columns of the resulting DataFrame. Thus, a dict of Series plus a specific index will discard all data not matching up to the passed index.

If axis labels are not passed, they will be constructed from the input data based on common sense rules.

In [None]:
# Each Series becomes a column; the row index is the union of both indexes.
k = {
    'years': y,
    'salaries': s,
}
df = pd.DataFrame(data=k)

In [None]:
# Cells with no value for a given label are shown as NaN.
print(df)

In [None]:
# For every person, scale the salary by (1 + yearly value) for each year,
# giving a nested mapping {name: {year: adjusted salary}}.
data = {
    name: {year: salary + (salary * change) for year, change in y.items()}
    for name, salary in s.items()
}

In [None]:
# Outer keys (names) become columns, inner keys (years) become the row index.
ydf = pd.DataFrame.from_dict(data)

In [None]:
# Print the full year-by-person table.
print(ydf)

In [None]:
# Show only the first rows (head() defaults to five).
ydf.head()

In [None]:
# orient='index' makes the outer dict keys the rows instead of the columns.
pd.DataFrame.from_dict(data, orient='index').head()

## Loading and manipulating data
The complete dataset can be downloaded from the <a href="https://www.kaggle.com/daveianhickey/2000-16-traffic-flow-england-scotland-wales">Kaggle website</a>; the cells below load a local sample of it.

In [None]:
# Relative path to the local sample of the traffic-accident data.
accidents = '../../data/trafficsample.csv'
# The first CSV column is used as the row index.
A = pd.read_csv(accidents, index_col=0, low_memory=False)

In [None]:
# Preview the first rows of the loaded table.
A.head()

In [None]:
# Look at the two columns that will be merged into a single timestamp.
A.loc[:, ['Date', 'Time']].head()

In [None]:
# Column types as inferred by read_csv.
A.dtypes

In [None]:
from datetime import datetime

def todate(d, t):
    """Combine a date string and a time string into a ``datetime``.

    Args:
        d: Date formatted as ``DD/MM/YYYY``.
        t: Time formatted as ``HH:MM``.

    Returns:
        The parsed ``datetime``, or ``np.nan`` when either argument is not a
        string (e.g. a float NaN standing in for a missing value).

    Raises:
        ValueError: If both arguments are strings but do not match the
            expected format.
    """
    try:
        stamp = " ".join((d, t))
    except TypeError:
        # Joining fails as soon as one operand is not a string.
        return np.nan
    return datetime.strptime(stamp, '%d/%m/%Y %H:%M')

In [None]:
# Build one timestamp per row by walking the Date and Time columns in
# lockstep. This avoids DataFrame.iterrows(), which creates a Series object
# for every row and is therefore slow on large tables.
A['Datetime'] = [todate(d, t) for d, t in zip(A['Date'], A['Time'])]

In [None]:
# (rows, columns) after adding the Datetime column.
A.shape

In [None]:
# Check the dtype pandas assigned to the new Datetime column.
A.dtypes

## Access a DataFrame by index and column

In [None]:
# iloc selects rows (or columns) by integer position, so it only takes
# integers; here the rows at positions 2, 3, 4 and 5.
A.iloc[2:6]

In [None]:
# loc selects rows (or columns) by their labels in the index; both labels
# must be present in the index of A.
A.loc[[85873, 340672]]

In [None]:
# Keep the accidents on dry roads, ordered by casualties, highest first.
is_dry = A['Road_Surface_Conditions'] == 'Dry'
selection = A[is_dry].sort_values(by='Number_of_Casualties', ascending=False)
# Preview a few descriptive columns of the top rows.
selection[[
    'Weather_Conditions', 'Police_Force', 'Accident_Severity',
    'Number_of_Vehicles', 'Number_of_Casualties',
]].head()

In [None]:
# Mean of the other selected columns within each weather condition.
(
    selection[[
        'Weather_Conditions', 'Police_Force', 'Accident_Severity',
        'Number_of_Vehicles', 'Number_of_Casualties',
    ]]
    .groupby('Weather_Conditions')
    .mean()
)

In [None]:
# Narrow the dry-road selection to the columns used in the plots below.
sel = selection[[
    'Weather_Conditions',
    'Police_Force',
    'Accident_Severity',
    'Number_of_Vehicles',
    'Number_of_Casualties',
    'Datetime',
]]

In [None]:
# One histogram per plottable column of sel.
sel.hist()
# Adjust subplot spacing so titles and tick labels do not overlap.
plt.tight_layout()
plt.show()

In [None]:
# Work on an explicit copy so the new column is not written into a slice of
# `selection`.
sel = sel.copy()
# Minutes elapsed since midnight, computed for the whole column at once via
# the .dt accessor instead of a row-by-row iterrows() loop. to_datetime is a
# no-op for a datetime column and converts an object column if needed; rows
# without a timestamp yield NaN.
stamps = pd.to_datetime(sel['Datetime'])
minutes = stamps.dt.hour * 60 + stamps.dt.minute
sel['Minutes'] = minutes

In [None]:
# Redraw the histograms of sel, which now also include the Minutes column
# when the preceding cell has been run.
sel.hist()
plt.tight_layout()
plt.show()

In [None]:
# Box plots of the columns of sel, drawn on a single wide axes
# (subplots() defaults to a 1x1 grid).
fig, axes = plt.subplots(figsize=(10, 4), sharey=True)
sel.boxplot(ax=axes)
fig.tight_layout()
plt.show()

In [None]:
# Scatter the accident coordinates; the low alpha makes dense areas darker.
fig, axes = plt.subplots(figsize=(10, 10), sharey=True)
lon = selection['Longitude'].values
lat = selection['Latitude'].values
axes.scatter(lon, lat, alpha=0.2)
plt.show()

In [None]:
import geopandas as gpd

In [None]:
# Load the low-resolution Natural Earth country polygons shipped with GeoPandas.
# NOTE(review): the `gpd.datasets` module was deprecated in GeoPandas 0.14 and
# removed in 1.0 — confirm the installed version still provides it.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

In [None]:
# Keep only the rows whose ISO alpha-3 code is GBR.
UK = world.loc[world['iso_a3'] == 'GBR']

In [None]:
# Only the first `limit` rows of `selection` are drawn.
limit = 2000
fig, axes = plt.subplots(figsize=(10, 10), sharey=True)
# Grey country outline as background, accident locations on top.
UK.plot(ax=axes, color='#CCCCCC')
lon = selection['Longitude'].values[:limit]
lat = selection['Latitude'].values[:limit]
axes.scatter(lon, lat, alpha=0.2)
plt.show()

# Example 1: k-means
Implement k-means using <code>NumPy</code> and functions