In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('data/processed/divvy_data_cleaned.csv')

In [None]:
# Convert to pandas' inferred dtypes, parse both timestamp columns, and derive
# the ride length as the difference between end and start times.
df = df.convert_dtypes().assign(
    started_at=lambda frame: pd.to_datetime(frame['started_at']),
    ended_at=lambda frame: pd.to_datetime(frame['ended_at']),
    trip_duration=lambda frame: frame['ended_at'] - frame['started_at'],
)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
durations = df['trip_duration']

In [None]:
# Convert every trip duration to minutes in one vectorised step instead of a
# per-row Python loop. The result is a float NumPy array, which the later
# np.mean / np.percentile calls and the column assignment accept unchanged.
minutes = (durations.dt.total_seconds() / 60).to_numpy()

In [None]:
df['trip_duration_minutes'] = minutes
df.info()

In [None]:
np.mean(minutes)

In [None]:
np.median(minutes)

In [None]:
np.std(minutes)

In [None]:
ptile1 = np.percentile(minutes, 25)
ptile1

In [None]:
ptile3 = np.percentile(minutes, 75)
ptile3

In [None]:
# Outlier fences: values more than 1.5 * IQR below the first quartile or
# above the third quartile fall outside the cutoffs.
iqr = ptile3 - ptile1
low_cutoff, high_cutoff = ptile1 - 1.5 * iqr, ptile3 + 1.5 * iqr

In [None]:
# Keep only rides whose duration does not exceed the upper cutoff.
df_no_outliers = df.loc[df['trip_duration_minutes'].le(high_cutoff)]

In [None]:
# reset_index returns a new frame rather than modifying in place, so the
# result must be assigned back for the renumbered index to persist.
# Preview only the first rows instead of rendering the whole frame.
df_no_outliers = df_no_outliers.reset_index(drop=True)
df_no_outliers.head()

In [None]:
# Drop rides whose duration falls below the lower cutoff.
df_no_outliers = df_no_outliers.loc[df_no_outliers['trip_duration_minutes'].ge(low_cutoff)]

In [None]:
# reset_index returns a new frame; without the assignment the call had no
# effect and the filtered frame kept its original, gappy index.
df_no_outliers = df_no_outliers.reset_index(drop=True)
len(df_no_outliers)

In [None]:
df_no_outliers.describe()

In [None]:
# Fraction of rides removed by the outlier filter, computed from the frames
# themselves so it stays correct when the data or the cutoffs change.
# NOTE(review): replaces the hard-coded counts 11272007 and 10439596, which
# presumably were len(df) and len(df_no_outliers) from an earlier run — confirm.
(len(df) - len(df_no_outliers)) / len(df)

In [None]:
percentiles = np.array([2.5, 25, 50, 75, 97.5])

In [None]:
ptiles_duration = np.percentile(minutes, percentiles)

In [None]:
def ecdf(data):
    """Return the empirical CDF coordinates of a one-dimensional sample.

    Parameters
    ----------
    data : sequence or array-like of measurements (must support ``len``).

    Returns
    -------
    (x, y) : tuple of numpy arrays
        ``x`` holds the measurements in ascending order and ``y`` the matching
        cumulative fractions ``1/n, 2/n, ..., 1``, where ``n = len(data)``.
    """
    count = len(data)
    ranks = np.arange(1, count + 1)
    return np.sort(data), ranks / count

In [None]:
x_duration, y_duration = ecdf(df['trip_duration_minutes'])

In [None]:
sns.set()

# ECDF of every trip duration, drawn as individual points.
fig, ax = plt.subplots()
ax.plot(x_duration, y_duration, marker='.', linestyle='none')

# Mark the selected percentiles with red diamonds on top of the curve.
ax.plot(ptiles_duration, percentiles / 100,
        marker='D', color='red', linestyle='none')

ax.set_xlabel('trip duration (minutes)')
ax.set_ylabel('ECDF')

plt.show()

In [None]:
# Histogram of all trip durations. The x label now states the unit,
# matching the ECDF plots of the same column.
fig, ax = plt.subplots()
ax.hist(df['trip_duration_minutes'], bins=20)
ax.set_xlabel('trip duration (minutes)')
ax.set_ylabel('count')
plt.show()

In [None]:
ptiles_duration = np.percentile(df_no_outliers['trip_duration_minutes'], percentiles)

In [None]:
x_duration_no_outliers, y_duration_no_outliers = ecdf(df_no_outliers['trip_duration_minutes'])

In [None]:
sns.set()

# ECDF of the trip durations that survived the outlier filter.
fig, ax = plt.subplots()
ax.plot(x_duration_no_outliers, y_duration_no_outliers,
        marker='.', linestyle='none')

# Mark the selected percentiles with red diamonds on top of the curve.
ax.plot(ptiles_duration, percentiles / 100,
        marker='D', color='red', linestyle='none')

ax.set_xlabel('trip duration (minutes)')
ax.set_ylabel('ECDF')

plt.show()

In [None]:
# Histogram of trip durations after outlier removal. The x label now states
# the unit, matching the ECDF plots of the same column.
fig, ax = plt.subplots()
ax.hist(df_no_outliers['trip_duration_minutes'], bins=20)
ax.set_xlabel('trip duration (minutes)')
ax.set_ylabel('count')
plt.show()

In [None]:
# Take the first 14 columns by position.
# NOTE(review): presumably these include the station/coordinate fields — confirm against the CSV schema.
coords = df.iloc[:, :14]

In [None]:
coords.head(10)

**TODO: delete rides that start or end outside the Chicago city limits. The checks below only count rides whose end coordinates fall outside the bounds:**

In [None]:
# Count rides ending south of latitude 41.5.
# The upper-bound variant of this check is df['end_lat'] > 42.2.
check = df.loc[df['end_lat'].lt(41.5)]
check.count()

In [None]:
# Count rides ending west of longitude -88.
# The opposite-bound variant of this check is df['end_lng'] > -87.4.
check2 = df.loc[df['end_lng'].lt(-88)]
check2.count()

In [None]:
# Number of rides per end station id. Calling value_counts() on the result of
# unique() would count already-deduplicated values (every count is 1) or fail
# outright when unique() returns a plain NumPy array.
df['end_id'].value_counts()

In [None]:
# Number of rides per start station id. Calling value_counts() on the result
# of unique() would count already-deduplicated values (every count is 1) or
# fail outright when unique() returns a plain NumPy array.
df['start_id'].value_counts()

**Calculating summary statistics:**

In [None]:
df.describe()

**Comparing member vs casual rides**

In [None]:
# Split the rides by rider type so each group can be summarised on its own.
member_rides = df.loc[df['member_casual'].eq('member')]
casual_rides = df.loc[df['member_casual'].eq('casual')]

# Transposed summary: one row per column of the member subset.
member_rides.describe().T

In [None]:
casual_rides.describe().T

In [None]:
# Histogram grid produced by DataFrame.hist for df; the extra spacing keeps
# the rotated tick labels and subplot titles from overlapping.
hist = df.hist(
    bins=15,
    figsize=(20, 10),
    xrot=45,
)
plt.subplots_adjust(hspace=1.5, wspace=0.4)

In [None]:
# Horizontal box plots for the columns of df (all rides, outliers included).
boxplot = df.boxplot(
    grid=False,
    vert=False,
    fontsize=15,
)

In [None]:
# Horizontal box plots for the columns of df_no_outliers (outliers removed).
boxplot = df_no_outliers.boxplot(
    grid=False,
    vert=False,
    fontsize=15,
)

In [None]:
# Pair plots.
# A pair plot draws one panel per pair of plotted columns, which is impractical
# on the full frame (the counts used earlier in this notebook are in the
# millions). Plot a fixed-seed random sample so the cell finishes and the
# figure is reproducible on re-run.
pairplot_sample = df.sample(n=min(len(df), 50_000), random_state=42)
g = sns.pairplot(pairplot_sample)