In [1]:
import time
import pandas as pd
import numpy as np
import os

# Maps each supported city name (as entered by the user, lowercase) to the
# file name of that city's CSV dataset.
CITY_DATA = {
    'chicago': 'chicago.csv',
    'new york city': 'new_york_city.csv',
    'washington': 'washington.csv',
}

def _prompt_for_choice(prompt, choices, label):
    """
    Prompts the user repeatedly until the entry matches one of the allowed choices.

    Args:
        (str) prompt - text displayed to the user by input()
        (list) choices - lowercase values that are accepted
        (str) label - name of the field being requested, used in the invalid-entry message
    Returns:
        (str) entry - the accepted value, lowercased and stripped of surrounding white space
    """
    while True:
        # cast to lowercase and strip leading and trailing white space
        entry = input(prompt).lower().strip()
        if entry in choices:
            print(entry + ' selected!')
            return entry
        print(entry + ' is an invalid entry for ' + label + '. \n')


def get_filters():
    """
    Asks user to specify a city, month, and day to analyze.

    Returns:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    """
    print('-'*40)
    print('Hello! Let\'s explore some US bikeshare data!')

    # loop until the entered city name is valid
    city = _prompt_for_choice(
        'Please enter one of the following cities: \'chicago\', \'new york city\', \'washington\' \n',
        ['chicago', 'new york city', 'washington'],
        'city')

    # get user input for month (all, january, february, ... , june)
    month = _prompt_for_choice(
        'Please specify the month of the calendar year (up until June) or enter \'all\' to select all months: \n',
        ['all', 'january', 'february', 'march', 'april', 'may', 'june'],
        'month')

    # get user input for day of week (all, monday, tuesday, ... sunday)
    day = _prompt_for_choice(
        'Please specify the day of the week or enter \'all\' to select all days of the week: \n',
        ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'all'],
        'day of the week')

    print('-'*40)
    return city, month, day
        

def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.

    Args:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - Pandas DataFrame containing city data filtered by month and day
    Raises:
        SystemExit - (exit status 0) when the city's CSV file is not in the current working directory
    """
    # define file path of dataset for user-entered city
    f = os.path.join(os.getcwd(), CITY_DATA[city])
    try:
        # Read in every field as a string initially so each field type is set explicitly below
        df = pd.read_csv(f, dtype=str)
    except FileNotFoundError:
        print('File ' + f + ' does not exist. Program is exiting and must be restarted after file has been saved to the referenced file path.' )
        # SystemExit (rather than os._exit) lets the interpreter flush the message
        # above and run normal cleanup; the exit status stays 0 as before.
        raise SystemExit(0)

    # Format data type of fields; unparseable values become NaT/NaN instead of raising
    for col in df.columns:
        if 'Time' in col:
            df[col] = pd.to_datetime(df[col], errors='coerce')
        elif col in ('Trip Duration', 'Birth Year'):
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Month, day and hour are all derived from the start time stamp only
    start_times = pd.DatetimeIndex(df['Start Time'])
    df['Month'] = start_times.month_name().str.lower()
    df['Day'] = start_times.day_name().str.lower()
    df['Start Hour'] = start_times.strftime('%H')

    # Drop the carried-in index column if the file has one
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    # Condition on user entered filters
    filtered = df
    if month != 'all':
        filtered = filtered.loc[filtered['Month'] == month]
    if day != 'all':
        filtered = filtered.loc[filtered['Day'] == day]
    return filtered.reset_index(drop=True)
            
def view_data(df):
    """
    Asks user if they would like to view the first 5 rows of data and each successive set of 5 rows in the dataset

    Stops as soon as every row has been shown, without offering further rows that do not exist.

    Args:
        (Pandas Dataframe) df - dataframe being viewed/analyzed by the user
    """
    answer = input('\nWould you like to see the first 5 rows of data? Enter yes or anything else for no.\n')
    total_rows = df.shape[0]
    row_count = 0
    while answer.lower() == 'yes':
        # strict comparison: when row_count equals the row count there is nothing left to print
        if row_count < total_rows:
            print(df.iloc[row_count:row_count + 5])
            row_count += 5
        if row_count >= total_rows:
            print('\n User has viewed all data in the dataset. \n')
            return
        answer = input('\nWould you like to see the next 5 rows of data? Enter yes or anything else for no.\n')
        if answer.lower() != 'yes':
            print('\n User has exited viewing the dataset 5 records at a time. \n')
    
    
def time_stats(df):
    """
    Displays statistics on the most frequent times of travel.

    Prints the most common value of the 'Month', 'Day' and 'Start Hour' columns,
    reports how long the calculation took, then offers to page through the data.
    If the dataframe has no rows, only a "no available data" notice is printed.

    Args:
        (Pandas Dataframe) df - dataframe being viewed/analyzed by the user
    """
    if not df.shape[0] > 0:
        print('There is no available data to compute time stats for the combination of city, month, and day which were entered. \n')
        print('-'*40)
        return

    print('\nCalculating The Most Frequent Times of Travel...\n')
    start_time = time.time()

    # (label, column, suffix) for each statistic, in display order
    reports = (
        ('Most commonly traveled month: ', 'Month', ''),
        ('Most commonly traveled day of the week: ', 'Day', ''),
        ('Most commonly traveled starting hour: ', 'Start Hour', ':00'),
    )
    for label, column, suffix in reports:
        most_common = df[column].mode()[0]
        print(label + str(most_common) + suffix)

    elapsed = time.time() - start_time
    print("\nThis took %s seconds." % elapsed)
    print('-'*40)

    view_data(df)

def station_stats(df):
    """
    Displays statistics on the most popular stations and trip.

    Prints the most common start station, the most common end station and the
    most frequent (start station, end station) pair, then offers to page
    through the data. If the dataframe has no rows, only a "no available data"
    notice is printed.

    Args:
        (Pandas Dataframe) df - dataframe being viewed/analyzed by the user
    """
    if df.shape[0] > 0:
        print('\nCalculating The Most Popular Stations and Trip...\n')
        start_time = time.time()

        # display most commonly used start station
        print('Most commonly used Start Station: ' + str(df['Start Station'].mode()[0]))

        # display most commonly used end station
        print('Most commonly used End Station: ' + str(df['End Station'].mode()[0]))

        # display most frequent combination of start station and end station trip
        # Count the pairs as tuples rather than joining and re-splitting a string:
        # station names may themselves contain '-' (e.g. 'Pier 40 - Hudson River Park').
        trip_counts = df.groupby(['Start Station', 'End Station']).size()
        startStation, endStation = trip_counts.idxmax()
        print('Most common trip taken: ' + str(startStation) + ' to ' + str(endStation))

        print("\nThis took %s seconds." % (time.time() - start_time))
        print('-'*40)

        view_data(df)
    else:
        print('There is no available data to compute station stats for the combination of city, month, and day which were entered. \n')
        print('-'*40)

def trip_duration_stats(df):
    """
    Displays statistics on the total and average trip duration.

    'Trip Duration' is reported in seconds: the sample rows recorded in this
    notebook show a duration of 350 for a trip from 13:49:38 to 13:55:28.
    If the dataframe has no rows, only a "no available data" notice is printed.

    Args:
        (Pandas Dataframe) df - dataframe being viewed/analyzed by the user
    """
    if df.shape[0] > 0:
        print('\nCalculating Trip Duration...\n')
        start_time = time.time()

        # display total travel time
        print('Total trip duration: ' + str(df['Trip Duration'].sum()) + ' seconds')

        # display mean travel time, rounded to a whole number of seconds
        # ('%.0f' formats directly instead of slicing '.0' off a stringified float)
        print('Mean trip duration: ' + '%.0f' % df['Trip Duration'].mean() + ' seconds')

        print("\nThis took %s seconds." % (time.time() - start_time))
        print('-'*40)

        view_data(df)
    else:
        print('There is no available data to compute trip duration stats for the combination of city, month, and day which were entered. \n')
        print('-'*40)
        
def user_stats(df):
    """
    Displays statistics on bikeshare users.

    Prints counts per 'User Type' and per 'Gender', and the earliest, most
    recent and most common 'Birth Year'. Each of those columns is optional: a
    notice is printed for any that the dataset does not contain. If the
    dataframe has no rows, only a "no available data" notice is printed.

    Args:
        (Pandas Dataframe) df - dataframe being viewed/analyzed by the user
    """
    if df.shape[0] > 0:
        print('\nCalculating User Stats...\n')
        start_time = time.time()

        colNames = df.columns.values
        if 'User Type' in colNames:
            # Display counts of user types
            print('User Type Counts: \n')
            print(df.groupby(by=['User Type']).size())
            print('\n')
        else:
            print('Column \'User Type\' not found in dataset. \n')

        if 'Gender' in colNames:
            # Display counts of gender
            print('Gender Counts: \n')
            print(df.groupby(by=['Gender']).size())
            print('\n')
        else:
            print('Column \'Gender\' not found in dataset. \n')

        if 'Birth Year' in colNames:
            # Display earliest, most recent, and most common year of birth
            # Missing years are dropped first so an all-missing selection is reported
            # instead of crashing; int() renders whole years whether the column is
            # stored as integers or as floats.
            birthYears = df['Birth Year'].dropna()
            if birthYears.empty:
                print('No \'Birth Year\' values are available for the selected data. \n')
            else:
                print('Oldest Passenger\'s Year of Birth: ' + str(int(birthYears.min())))
                print('Youngest Passenger\'s Year of Birth: ' + str(int(birthYears.max())))
                print('Most common Year of Birth: ' + str(int(birthYears.mode()[0])))
        else:
            print('Column \'Birth Year\' not found in dataset. \n')

        print("\nThis took %s seconds." % (time.time() - start_time))
        print('-'*40)

        view_data(df)
    else:
        print('There is no available data to compute user stats for the combination of city, month, and day which were entered. \n')
        print('-'*40)

def main():
    """
    Runs the interactive bikeshare session.

    Each pass asks for the city/month/day filters, loads the matching data and
    prints the time, station, trip duration and user statistics in that order.
    The session repeats for as long as the user answers 'yes' (in any letter
    case) to the restart prompt.
    """
    keep_going = True
    while keep_going:
        city, month, day = get_filters()
        df = load_data(city, month, day)

        # each report prints its own section and then offers to page through the data
        time_stats(df)
        station_stats(df)
        trip_duration_stats(df)
        user_stats(df)

        restart = input('\nWould you like to restart? Enter yes or anything else for no.\n')
        keep_going = restart.lower() == 'yes'


# Start the interactive session only when this code is run directly, not when imported.
if __name__ == "__main__":
    main()


----------------------------------------
Hello! Let's explore some US bikeshare data!
Please enter one of the following cities: 'chicago', 'new york city', 'washington' 
chicago
chicago selected!
Please specify the month of the calendar year (up until June) or enter 'all' to select all months: 
march
march selected!
Please specify the day of the week or enter 'all' to select all days of the week: 
all
all selected!
----------------------------------------

Calculating The Most Frequent Times of Travel...

Most commonly traveled month: march
Most commonly traveled day of the week: friday
Most commonly traveled starting hour: 17:00

This took 0.01252603530883789 seconds.
----------------------------------------

Would you like to see the first 5 rows of data? Enter yes or anything else for no.
yes
           Start Time            End Time  Trip Duration  \
0 2017-03-06 13:49:38 2017-03-06 13:55:28            350   
1 2017-03-23 09:38:27 2017-03-23 09:42:41            254   
2 2017-03-23 


Would you like to see the next 5 rows of data? Enter yes or anything else for no.
yes
            Start Time            End Time  Trip Duration  \
10 2017-03-09 13:08:17 2017-03-09 13:14:22            365   
11 2017-03-15 07:30:23 2017-03-15 07:36:22            359   
12 2017-03-29 07:27:35 2017-03-29 07:33:37            362   
13 2017-03-05 13:21:22 2017-03-05 13:33:15            713   
14 2017-03-24 15:10:29 2017-03-24 15:19:44            555   

                   Start Station                   End Station   User Type  \
10  Financial Pl & Congress Pkwy   Michigan Ave & Jackson Blvd  Subscriber   
11       Wood St & Milwaukee Ave  Marshfield Ave & Cortland St  Subscriber   
12      LaSalle St & Illinois St      Fairbanks Ct & Grand Ave  Subscriber   
13      Kingsbury St & Kinzie St       Aberdeen St & Monroe St  Subscriber   
14       Franklin St & Monroe St       Aberdeen St & Monroe St  Subscriber   

    Gender  Birth Year  Month        Day Start Hour  
10    Male      1955.0 

In [21]:
import pandas as pd
import os

# Scratch cell: load the Chicago data set from the working directory and derive the
# lowercase month name, lowercase day name and '%H' start hour from 'Start Time'.
df = pd.read_csv(os.path.join(os.getcwd(), 'chicago.csv'))
start_times = pd.DatetimeIndex(df['Start Time'])  # parsed once, reused for all three columns
df['Month'] = start_times.month_name().str.lower()
df['Day'] = start_times.day_name().str.lower()
df['Start Hour'] = start_times.strftime('%H')
df.head()

Unnamed: 0.1,Unnamed: 0,Start Time,End Time,Trip Duration,Start Station,End Station,User Type,Gender,Birth Year,Month,Day,Start Hour
0,1423854,2017-06-23 15:09:32,2017-06-23 15:14:53,321,Wood St & Hubbard St,Damen Ave & Chicago Ave,Subscriber,Male,1992.0,june,friday,15
1,955915,2017-05-25 18:19:03,2017-05-25 18:45:53,1610,Theater on the Lake,Sheffield Ave & Waveland Ave,Subscriber,Female,1992.0,may,thursday,18
2,9031,2017-01-04 08:27:49,2017-01-04 08:34:45,416,May St & Taylor St,Wood St & Taylor St,Subscriber,Male,1981.0,january,wednesday,8
3,304487,2017-03-06 13:49:38,2017-03-06 13:55:28,350,Christiana Ave & Lawrence Ave,St. Louis Ave & Balmoral Ave,Subscriber,Male,1986.0,march,monday,13
4,45207,2017-01-17 14:53:07,2017-01-17 15:02:01,534,Clark St & Randolph St,Desplaines St & Jackson Blvd,Subscriber,Male,1975.0,january,tuesday,14


In [22]:
# Scratch cell: list the distinct month names present in df.
df.Month.unique()

array(['june', 'may', 'january', 'march', 'april', 'february'],
      dtype=object)

In [10]:
# Scratch cell: load the New York City data set from the working directory and derive the
# lowercase month name, lowercase day name and '%H' start hour from 'Start Time'.
df2 = pd.read_csv(os.path.join(os.getcwd(), 'new_york_city.csv'))
start_times = pd.DatetimeIndex(df2['Start Time'])  # parsed once, reused for all three columns
df2['Month'] = start_times.month_name().str.lower()
df2['Day'] = start_times.day_name().str.lower()
df2['Start Hour'] = start_times.strftime('%H')
df2.head()

Unnamed: 0.1,Unnamed: 0,Start Time,End Time,Trip Duration,Start Station,End Station,User Type,Gender,Birth Year,Month,Day,Start Hour
0,5688089,2017-06-11 14:55:05,2017-06-11 15:08:21,795,Suffolk St & Stanton St,W Broadway & Spring St,Subscriber,Male,1998.0,june,sunday,14
1,4096714,2017-05-11 15:30:11,2017-05-11 15:41:43,692,Lexington Ave & E 63 St,1 Ave & E 78 St,Subscriber,Male,1981.0,may,thursday,15
2,2173887,2017-03-29 13:26:26,2017-03-29 13:48:31,1325,1 Pl & Clinton St,Henry St & Degraw St,Subscriber,Male,1987.0,march,wednesday,13
3,3945638,2017-05-08 19:47:18,2017-05-08 19:59:01,703,Barrow St & Hudson St,W 20 St & 8 Ave,Subscriber,Female,1986.0,may,monday,19
4,6208972,2017-06-21 07:49:16,2017-06-21 07:54:46,329,1 Ave & E 44 St,E 53 St & 3 Ave,Subscriber,Male,1992.0,june,wednesday,7


In [4]:
# Scratch cell: load the Washington data set from the working directory and derive the
# lowercase month name, lowercase day name and '%H' start hour from 'Start Time'.
df3 = pd.read_csv(os.path.join(os.getcwd(), 'washington.csv'))
start_times = pd.DatetimeIndex(df3['Start Time'])  # parsed once, reused for all three columns
df3['Month'] = start_times.month_name().str.lower()
df3['Day'] = start_times.day_name().str.lower()
df3['Start Hour'] = start_times.strftime('%H')
df3.head()

Unnamed: 0.1,Unnamed: 0,Start Time,End Time,Trip Duration,Start Station,End Station,User Type,Month,Day,Start Hour
0,1621326,2017-06-21 08:36:34,2017-06-21 08:44:43,489.066,14th & Belmont St NW,15th & K St NW,Subscriber,june,wednesday,8
1,482740,2017-03-11 10:40:00,2017-03-11 10:46:00,402.549,Yuma St & Tenley Circle NW,Connecticut Ave & Yuma St NW,Subscriber,march,saturday,10
2,1330037,2017-05-30 01:02:59,2017-05-30 01:13:37,637.251,17th St & Massachusetts Ave NW,5th & K St NW,Subscriber,may,tuesday,1
3,665458,2017-04-02 07:48:35,2017-04-02 08:19:03,1827.341,Constitution Ave & 2nd St NW/DOL,M St & Pennsylvania Ave NW,Customer,april,sunday,7
4,1481135,2017-06-10 08:36:28,2017-06-10 09:02:17,1549.427,Henry Bacon Dr & Lincoln Memorial Circle NW,Maine Ave & 7th St SW,Subscriber,june,saturday,8


In [5]:
# Scratch cell: list the distinct month names present in df3.
df3.Month.unique()

array(['june', 'march', 'may', 'april', 'february', 'january'],
      dtype=object)

In [None]:
# Scratch cell: (re)derive the lowercase month name of each trip's start time in df3.
df3['Month'] = pd.DatetimeIndex(df3['Start Time']).month_name().str.lower()

In [31]:
# Scratch cell: show only the rows of df3 whose start month is June.
# NOTE(review): the recorded output of this cell has 'Gender' and 'Birth Year' columns,
# which the washington.csv preview does not show - df3 presumably held a different
# data set when this cell last ran; re-run the cells in order to confirm.
df3.loc[df3['Month'] == 'june']

Unnamed: 0.1,Unnamed: 0,Start Time,End Time,Trip Duration,Start Station,End Station,User Type,Gender,Birth Year,Month,Day,Start Hour
0,5688089,2017-06-11 14:55:05,2017-06-11 15:08:21,795,Suffolk St & Stanton St,W Broadway & Spring St,Subscriber,Male,1998.0,june,sunday,14
4,6208972,2017-06-21 07:49:16,2017-06-21 07:54:46,329,1 Ave & E 44 St,E 53 St & 3 Ave,Subscriber,Male,1992.0,june,wednesday,07
17,6388534,2017-06-23 21:21:59,2017-06-23 21:30:45,525,E 2 St & Avenue C,E 11 St & 2 Ave,Subscriber,Female,1997.0,june,friday,21
29,5383277,2017-06-06 11:23:30,2017-06-06 11:26:56,205,W 43 St & 10 Ave,9 Ave & W 45 St,Subscriber,Male,1951.0,june,tuesday,11
34,6321417,2017-06-22 18:52:41,2017-06-22 19:15:50,1388,E 55 St & 3 Ave,Milton St & Franklin St,Subscriber,Male,1988.0,june,thursday,18
...,...,...,...,...,...,...,...,...,...,...,...,...
299976,6392528,2017-06-24 02:59:07,2017-06-24 03:19:11,1203,W 13 St & 6 Ave,Broadway & Battery Pl,Subscriber,Male,1992.0,june,saturday,02
299977,5120176,2017-06-01 15:58:49,2017-06-01 16:21:26,1357,St James Pl & Pearl St,Cadman Plaza E & Tillary St,Customer,,,june,thursday,15
299980,5796961,2017-06-13 13:05:57,2017-06-13 13:32:01,1563,W 45 St & 8 Ave,West St & Chambers St,Customer,,,june,tuesday,13
299983,5131279,2017-06-01 17:56:41,2017-06-01 18:02:07,326,Pier 40 - Hudson River Park,West St & Chambers St,Subscriber,Female,1988.0,june,thursday,17


In [15]:
# Scratch cell: most frequent month name in df3 (first entry of the mode result).
df3['Month'].mode()[0]

'june'

In [39]:
# Scratch cell: derive lowercase month/day names for both ends of each trip, plus the
# '%H' start hour. The 'end ...' columns are computed from 'End Time' (they were
# previously copied from 'Start Time', making them duplicates of the 'start ...' columns).
start_times = pd.DatetimeIndex(df['Start Time'])
end_times = pd.DatetimeIndex(df['End Time'])
df['start month'] = start_times.month_name().str.lower()
df['end month'] = end_times.month_name().str.lower()
df['start day'] = start_times.day_name().str.lower()
df['end day'] = end_times.day_name().str.lower()
df['Start Hour'] = start_times.strftime('%H')

In [6]:
# Scratch cell: number of rows in df for each 'User Type' value.
df.groupby(by=['User Type']).size()

User Type
Customer       61110
Dependent          1
Subscriber    238889
dtype: int64

In [53]:
# Scratch cell: display the value of `a` (assigned in a cell that is not part of this export).
a

'Lake Shore Dr & Monroe St'