## Industry Research, Data Collection & Integration.

### -> Importing Libraries.

In [2]:
import pandas as pd

### 1).  Identifying and accessing relevant data sources.

#### (i). Traffic data.

In [3]:
traffic_data = pd.read_csv('Dataset_Uber traffic.csv',
                           index_col='Datetime', 
                         parse_dates=['Datetime'], 
                         date_format='%d/%m/%y %H:%M')
traffic_data

Unnamed: 0_level_0,Junction,Vehicles,ID
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-11-01 00:00:00,1,15,20151101001
2015-11-01 01:00:00,1,13,20151101011
2015-11-01 02:00:00,1,10,20151101021
2015-11-01 03:00:00,1,7,20151101031
2015-11-01 04:00:00,1,9,20151101041
...,...,...,...
2017-06-30 19:00:00,4,11,20170630194
2017-06-30 20:00:00,4,30,20170630204
2017-06-30 21:00:00,4,16,20170630214
2017-06-30 22:00:00,4,22,20170630224


### (ii). Weather data.


In [4]:
weather_data = pd.read_csv("Weather.csv",
                           index_col='Datetime', 
                         parse_dates=['Datetime'], 
                         date_format='%Y-%m-%d')
weather_data

Unnamed: 0_level_0,Temperature,Humidity,Precipitation,Windspeed
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-11-01,17.0,75.6,0.000,8.3
2015-11-02,16.8,77.4,0.300,7.9
2015-11-03,16.2,78.1,0.800,8.3
2015-11-04,16.1,72.9,0.000,6.5
2015-11-05,16.7,62.9,1.500,9.0
...,...,...,...,...
2017-06-27,28.4,79.8,5.344,22.3
2017-06-28,25.2,91.3,19.666,27.7
2017-06-29,24.6,92.5,4.591,13.0
2017-06-30,25.6,89.9,0.043,16.6


### (iii). Event data.

In [5]:
event_data = pd.read_csv("events.csv",
                         index_col='Datetime', 
                         parse_dates=['Datetime'], 
                         date_format='%Y-%m-%d')
event_data

Unnamed: 0_level_0,Event Name,Location,Type,Description
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-11-01,Diwali Festival,Pan India,Public Holiday,Annual Hindu festival of lights
2015-11-02,Pro Kabaddi League Final,Hyderabad,Sports,Final match of the Pro Kabaddi League Season 2
2015-11-08,Coldplay Concert,Mumbai,Concert,Live performance by the band Coldplay
2015-11-25,Guru Nanak Jayanti,Amritsar,Public Holiday,Birthday of Guru Nanak(founder of Sikhism)
2015-12-01,Cricket Test Match,Delhi,Sports,Test match between India and South Africa
...,...,...,...,...
2017-05-19,Pro Kabaddi League Match,Bengaluru,Sports,Professional Kabaddi league match
2017-06-05,Cricket Test Match,Chennai,Sports,Test match between India and Bangladesh
2017-06-10,Ed Sheeran Concert,Chennai,Concert,Live performance by the singer Ed Sheeran
2017-06-15,Pro Kabaddi League Match,Mumbai,Sports,Professional Kabaddi league match


### 2). Integrating data from various sources.

#### Developing a data integration pipeline to merge Traffic, Weather, and Event data into a unified dataset, ensuring data is synchronized based on timestamps to align traffic data with corresponding weather conditions and events.

In [6]:
# Resample the weather data to hourly frequency
weather_data = weather_data.resample('H').ffill()

# Resample the weather data to hourly frequency
event_data = event_data.resample('H').ffill()

# Merging datasets on 'Datetime'
merge_data = pd.merge(traffic_data, weather_data, on='Datetime', how='left')
Integrated_Dataset = pd.merge(merge_data, event_data, on='Datetime', how='left')

print(Integrated_Dataset)

                     Junction  Vehicles           ID  Temperature  Humidity  \
Datetime                                                                      
2015-11-01 00:00:00         1        15  20151101001         17.0      75.6   
2015-11-01 01:00:00         1        13  20151101011         17.0      75.6   
2015-11-01 02:00:00         1        10  20151101021         17.0      75.6   
2015-11-01 03:00:00         1         7  20151101031         17.0      75.6   
2015-11-01 04:00:00         1         9  20151101041         17.0      75.6   
...                       ...       ...          ...          ...       ...   
2017-06-30 19:00:00         4        11  20170630194         25.6      89.9   
2017-06-30 20:00:00         4        30  20170630204         25.6      89.9   
2017-06-30 21:00:00         4        16  20170630214         25.6      89.9   
2017-06-30 22:00:00         4        22  20170630224         25.6      89.9   
2017-06-30 23:00:00         4        12  20170630234

### 3). Handling data quality issues.
#### (i). Cleaning the dataset by removing duplicates, handling missing values (imputation or removal), and correcting inconsistencies.

In [7]:
# Identify duplicate records
duplicate_mask = Integrated_Dataset.duplicated()
duplicates = Integrated_Dataset[duplicate_mask]
print(duplicates)

# Remove duplicates
Integrated_Dataset = Integrated_Dataset.drop_duplicates()

Empty DataFrame
Columns: [Junction, Vehicles, ID, Temperature, Humidity, Precipitation, Windspeed, Event Name, Location, Type, Description]
Index: []


In [57]:
# Check for any remaining missing values
print(Integrated_Dataset .isnull().sum())

# Handle missing values in the final merged dataset
# For simplicity, let's fill missing values with a placeholder 'Unknown'.
Integrated_Dataset.fillna({
    'Event Name': 'No Event',
    'Location': 'NA',
    'Type': 'NA',
    'Description': 'NA',
    'Source': 'NA'
}, inplace=True)

#Displaying Final Integrated Dataset after handling missing values
print(Integrated_Dataset)

Junction           0
Vehicles           0
ID                 0
Temperature        0
Humidity           0
Precipitation      0
Windspeed          0
Event Name       956
Location         956
Type             956
Description      956
dtype: int64
                     Junction  Vehicles           ID  Temperature  Humidity  \
Datetime                                                                      
2015-11-01 00:00:00         1        15  20151101001         17.0      75.6   
2015-11-01 01:00:00         1        13  20151101011         17.0      75.6   
2015-11-01 02:00:00         1        10  20151101021         17.0      75.6   
2015-11-01 03:00:00         1         7  20151101031         17.0      75.6   
2015-11-01 04:00:00         1         9  20151101041         17.0      75.6   
...                       ...       ...          ...          ...       ...   
2017-06-30 19:00:00         4        11  20170630194         25.6      89.9   
2017-06-30 20:00:00         4        30  2017

### (ii). Normalizing or standardizing data to bring different variables to a common scale.

In [58]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
Integrated_Dataset['vehicle_scaled'] = scaler.fit_transform(Integrated_Dataset[['Vehicles']])

# Display normalized data
print("\nNormalized Integrated Dataset :")
print(Integrated_Dataset.head(10))



Normalized Integrated Dataset :
                     Junction  Vehicles           ID  Temperature  Humidity  \
Datetime                                                                      
2015-11-01 00:00:00         1        15  20151101001         17.0      75.6   
2015-11-01 01:00:00         1        13  20151101011         17.0      75.6   
2015-11-01 02:00:00         1        10  20151101021         17.0      75.6   
2015-11-01 03:00:00         1         7  20151101031         17.0      75.6   
2015-11-01 04:00:00         1         9  20151101041         17.0      75.6   
2015-11-01 05:00:00         1         6  20151101051         17.0      75.6   
2015-11-01 06:00:00         1         9  20151101061         17.0      75.6   
2015-11-01 07:00:00         1         8  20151101071         17.0      75.6   
2015-11-01 08:00:00         1        11  20151101081         17.0      75.6   
2015-11-01 09:00:00         1        12  20151101091         17.0      75.6   

                  

### ->  Saving the Final Merged dataset to CSV.

In [61]:
Integrated_Dataset.to_csv('Integrated_Dataset.csv')

### -> Importing the Integrated Dataset.

In [62]:
df = pd.read_csv('Integrated_Dataset.csv')
df

Unnamed: 0,Datetime,Junction,Vehicles,ID,Temperature,Humidity,Precipitation,Windspeed,Event Name,Location,Type,Description,vehicle_scaled
0,2015-11-01 00:00:00,1,15,20151101001,17.0,75.6,0.000,8.3,Diwali Festival,Pan India,Public Holiday,Annual Hindu festival of lights,-0.375489
1,2015-11-01 01:00:00,1,13,20151101011,17.0,75.6,0.000,8.3,Diwali Festival,Pan India,Public Holiday,Annual Hindu festival of lights,-0.471875
2,2015-11-01 02:00:00,1,10,20151101021,17.0,75.6,0.000,8.3,Diwali Festival,Pan India,Public Holiday,Annual Hindu festival of lights,-0.616454
3,2015-11-01 03:00:00,1,7,20151101031,17.0,75.6,0.000,8.3,Diwali Festival,Pan India,Public Holiday,Annual Hindu festival of lights,-0.761034
4,2015-11-01 04:00:00,1,9,20151101041,17.0,75.6,0.000,8.3,Diwali Festival,Pan India,Public Holiday,Annual Hindu festival of lights,-0.664648
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48115,2017-06-30 19:00:00,4,11,20170630194,25.6,89.9,0.043,16.6,No Event,,,,-0.568261
48116,2017-06-30 20:00:00,4,30,20170630204,25.6,89.9,0.043,16.6,No Event,,,,0.347408
48117,2017-06-30 21:00:00,4,16,20170630214,25.6,89.9,0.043,16.6,No Event,,,,-0.327296
48118,2017-06-30 22:00:00,4,22,20170630224,25.6,89.9,0.043,16.6,No Event,,,,-0.038137
