## Industry Research, Data Collection & Integration.

### -> Importing Libraries.

In [1]:
import pandas as pd

### 1).  Identifying and accessing relevant data sources.

#### (i). Traffic data.

In [17]:
traffic_data = pd.read_csv("Dataset_Uber traffic.csv")
traffic_data['Datetime'] = pd.to_datetime(traffic_data['Datetime'], format='%d/%m/%y %H:%M', errors='coerce')
traffic_data.head()

Unnamed: 0,Datetime,Junction,Vehicles,ID
0,2015-11-01 00:00:00,1,15,20151101001
1,2015-11-01 01:00:00,1,13,20151101011
2,2015-11-01 02:00:00,1,10,20151101021
3,2015-11-01 03:00:00,1,7,20151101031
4,2015-11-01 04:00:00,1,9,20151101041


### (ii). Weather data.


In [3]:
weather_data = pd.read_csv("Weather.csv")
weather_data.head()

Unnamed: 0,Datetime,Temperature,Humidity,Precipitation,Windspeed
0,01/11/2015,17.0,75.6,0.0,8.3
1,02/11/2015,16.8,77.4,0.3,7.9
2,03/11/2015,16.2,78.1,0.8,8.3
3,04/11/2015,16.1,72.9,0.0,6.5
4,05/11/2015,16.7,62.9,1.5,9.0


### (iii). Event data.

In [4]:
event_data = pd.read_csv("Events.csv")
event_data.head()

Unnamed: 0,Event Name,Datetime,Location,Type,Description,Source
0,India vs South Africa ODI,25/10/2015,Mumbai (Maharashtra),Sports,Cricket match between India and South Africa,ESPN
1,Kolkata International Film Festival,10/11/2015,Kolkata (West Bengal),Cultural,Film festival featuring films from around the ...,Eventshigh
2,Diwali,11/11/2015,Nationwide,Public Holiday,Hindu festival of lights,Government Portal
3,NH7 Weekender,04/12/2015,Delhi (Delhi),Concert,Music festival featuring various artists and g...,BookMyShow
4,Bodhi Day,08/12/2015,Nationwide,Public Holiday,Buddhist holiday celebrating the enlightenment...,Government Portal


### 2). Integrating data from various sources.

#### Developing a data integration pipeline to merge Traffic, Weather, and Event data into a unified dataset, ensuring data is synchronized based on timestamps to align traffic data with corresponding weather conditions and events.

In [18]:
# Converting 'Datetime' column to datetime format
traffic_data['Datetime'] = pd.to_datetime(traffic_data['Datetime'], format='%d/%m/%Y %H:%M', errors='coerce')
weather_data['Datetime'] = pd.to_datetime(weather_data['Datetime'], format='%d/%m/%Y %H:%M', errors='coerce')
event_data['Datetime'] = pd.to_datetime(event_data['Datetime'], format='%d/%m/%Y %H:%M', errors='coerce')

# Merging datasets on 'Datetime'
merged_data = pd.merge(traffic_data, weather_data, on=['Datetime'], how='left')
Integrated_Dataset = pd.merge(merged_data, event_data, on=['Datetime'], how='left')
print(Integrated_Dataset)

                 Datetime  Junction  Vehicles           ID  Temperature  \
0     2015-11-01 00:00:00         1        15  20151101001         17.0   
1     2015-11-01 01:00:00         1        13  20151101011          NaN   
2     2015-11-01 02:00:00         1        10  20151101021          NaN   
3     2015-11-01 03:00:00         1         7  20151101031          NaN   
4     2015-11-01 04:00:00         1         9  20151101041          NaN   
...                   ...       ...       ...          ...          ...   
48133 2017-06-30 19:00:00         4        11  20170630194          NaN   
48134 2017-06-30 20:00:00         4        30  20170630204          NaN   
48135 2017-06-30 21:00:00         4        16  20170630214          NaN   
48136 2017-06-30 22:00:00         4        22  20170630224          NaN   
48137 2017-06-30 23:00:00         4        12  20170630234          NaN   

       Humidity  Precipitation  Windspeed Event Name Location Type  \
0          75.6            0.

### 3). Handling data quality issues.
#### (i). Cleaning the dataset by removing duplicates, handling missing values (imputation or removal), and correcting inconsistencies.

In [19]:
# Identify duplicate records
duplicate_mask = Integrated_Dataset.duplicated()
duplicates = Integrated_Dataset[duplicate_mask]
print(duplicates)

# Remove duplicates
Integrated_Dataset = Integrated_Dataset.drop_duplicates()

Empty DataFrame
Columns: [Datetime, Junction, Vehicles, ID, Temperature, Humidity, Precipitation, Windspeed, Event Name, Location, Type, Description, Source]
Index: []


In [21]:
# NOTE :- We will be only removing missing values from merged data of traffic and weather, as events occur ocassionaly.

# Removing rows with missing values 
new_merged_data = merged_data.dropna() 

# Reseting Index
new_merged_data = new_merged_data.reset_index(drop=True)
  
print("\nDataset after removing rows with Missing Values:\n") 
print(new_merged_data)

# Check for any remaining missing values
print("\nMissing Values Check:")
print(new_merged_data .isnull().sum())


Dataset after removing rows with Missing Values:

       Datetime  Junction  Vehicles           ID  Temperature  Humidity  \
0    2015-11-01         1        15  20151101001         17.0      75.6   
1    2015-11-02         1        14  20151102001         16.8      77.4   
2    2015-11-03         1        21  20151103001         16.2      78.1   
3    2015-11-04         1        18  20151104001         16.1      72.9   
4    2015-11-05         1        19  20151105001         16.7      62.9   
...         ...       ...       ...          ...          ...       ...   
2000 2017-06-26         4         6  20170626004         28.7      70.2   
2001 2017-06-27         4         9  20170627004         28.4      79.8   
2002 2017-06-28         4         6  20170628004         25.2      91.3   
2003 2017-06-29         4        14  20170629004         24.6      92.5   
2004 2017-06-30         4         9  20170630004         25.6      89.9   

      Precipitation  Windspeed  
0             0

In [22]:
# Merging the new_merged_dataset and event dataset. 
Integrated_Dataset = pd.merge(new_merged_data, event_data, on=['Datetime'], how='left')

# Handle missing values in the final merged dataset
# For simplicity, let's fill missing values with a placeholder 'Unknown'.
Integrated_Dataset.fillna({
    'Event Name': 'No Event',
    'Location': 'NA',
    'Type': 'NA',
    'Description': 'NA',
    'Source': 'NA'
}, inplace=True)

#Displaying Final Integrated Dataset after handling missing values
print(Integrated_Dataset)

       Datetime  Junction  Vehicles           ID  Temperature  Humidity  \
0    2015-11-01         1        15  20151101001         17.0      75.6   
1    2015-11-02         1        14  20151102001         16.8      77.4   
2    2015-11-03         1        21  20151103001         16.2      78.1   
3    2015-11-04         1        18  20151104001         16.1      72.9   
4    2015-11-05         1        19  20151105001         16.7      62.9   
...         ...       ...       ...          ...          ...       ...   
2018 2017-06-26         4         6  20170626004         28.7      70.2   
2019 2017-06-27         4         9  20170627004         28.4      79.8   
2020 2017-06-28         4         6  20170628004         25.2      91.3   
2021 2017-06-29         4        14  20170629004         24.6      92.5   
2022 2017-06-30         4         9  20170630004         25.6      89.9   

      Precipitation  Windspeed Event Name Location Type Description Source  
0             0.000   

### (ii). Normalizing or standardizing data to bring different variables to a common scale.

In [23]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Normalization using Min-Max Scaling
scaler = MinMaxScaler()
columns_to_normalize = ['Vehicles', 'Temperature', 'Humidity', 'Precipitation', 'Windspeed']
Integrated_Dataset[columns_to_normalize] = scaler.fit_transform(Integrated_Dataset[columns_to_normalize])


### ->  Saving the Final Merged dataset to CSV.

In [24]:
Integrated_Dataset.to_csv('Integrated_Dataset.csv', index=False)

### -> Importing the Intgrated Dataset.

In [25]:
df = pd.read_csv('Integrated_Dataset.csv')
df.head(60)

Unnamed: 0,Datetime,Junction,Vehicles,ID,Temperature,Humidity,Precipitation,Windspeed,Event Name,Location,Type,Description,Source
0,2015-11-01,1,0.12963,20151101001,0.371841,0.740964,0.0,0.070732,No Event,,,,
1,2015-11-02,1,0.12037,20151102001,0.364621,0.768072,0.004388,0.060976,No Event,,,,
2,2015-11-03,1,0.185185,20151103001,0.34296,0.778614,0.0117,0.070732,No Event,,,,
3,2015-11-04,1,0.157407,20151104001,0.33935,0.700301,0.0,0.026829,No Event,,,,
4,2015-11-05,1,0.166667,20151105001,0.361011,0.549699,0.021938,0.087805,No Event,,,,
5,2015-11-06,1,0.138889,20151106001,0.3213,0.618976,0.0,0.053659,No Event,,,,
6,2015-11-07,1,0.138889,20151107001,0.314079,0.701807,0.002925,0.070732,No Event,,,,
7,2015-11-08,1,0.111111,20151108001,0.3213,0.680723,0.002925,0.060976,No Event,,,,
8,2015-11-09,1,0.111111,20151109001,0.3213,0.700301,0.010238,0.053659,No Event,,,,
9,2015-11-10,1,0.166667,20151110001,0.32852,0.64759,0.021938,0.0,Kolkata International Film Festival,Kolkata (West Bengal),Cultural,Film festival featuring films from around the ...,Eventshigh
