## Industry Research, Data Collection & Integration.

### -> Importing Libraries.

In [2]:
import pandas as pd

### 1).  Identifying and accessing relevant data sources.

#### (i). Traffic data.

In [97]:
traffic_data = pd.read_csv("Dataset_Uber traffic.csv")
traffic_data['Datetime'] = pd.to_datetime(traffic_data['Datetime'], format='%d/%m/%y %H:%M', errors='coerce')
traffic_data.tail(60)

Unnamed: 0,Datetime,Junction,Vehicles,ID
48060,2017-06-28 12:00:00,4,10,20170628124
48061,2017-06-28 13:00:00,4,14,20170628134
48062,2017-06-28 14:00:00,4,15,20170628144
48063,2017-06-28 15:00:00,4,17,20170628154
48064,2017-06-28 16:00:00,4,16,20170628164
48065,2017-06-28 17:00:00,4,14,20170628174
48066,2017-06-28 18:00:00,4,11,20170628184
48067,2017-06-28 19:00:00,4,15,20170628194
48068,2017-06-28 20:00:00,4,23,20170628204
48069,2017-06-28 21:00:00,4,17,20170628214


### (ii). Weather data.


In [65]:
weather_data = pd.read_csv("Weather.csv")
weather_data['Datetime'] = pd.to_datetime(weather_data['Datetime'], format='%d/%m/%Y', errors='coerce')
weather_data

Unnamed: 0,Datetime,Temperature,Humidity,Precipitation,Windspeed
0,2015-11-01,17.0,75.6,0.000,8.3
1,2015-11-02,16.8,77.4,0.300,7.9
2,2015-11-03,16.2,78.1,0.800,8.3
3,2015-11-04,16.1,72.9,0.000,6.5
4,2015-11-05,16.7,62.9,1.500,9.0
...,...,...,...,...,...
603,2017-06-26,28.7,70.2,0.900,24.1
604,2017-06-27,28.4,79.8,5.344,22.3
605,2017-06-28,25.2,91.3,19.666,27.7
606,2017-06-29,24.6,92.5,4.591,13.0


### (iii). Event data.

In [91]:
event_data = pd.read_csv("events.csv")
event_data['Datetime'] = pd.to_datetime(event_data['Datetime'], format='%Y-%m-%d', errors='coerce')
event_data

Unnamed: 0,Datetime,Event Name,Location,Type,Description
0,2015-11-01,Diwali Festival,Pan India,Public Holiday,Annual Hindu festival of lights
1,2015-11-02,Pro Kabaddi League Final,Hyderabad,Sports,Final match of the Pro Kabaddi League Season 2
2,2015-11-08,Coldplay Concert,Mumbai,Concert,Live performance by the band Coldplay
3,2015-11-25,Guru Nanak Jayanti,Amritsar,Public Holiday,Birthday of Guru Nanak(founder of Sikhism)
4,2015-12-01,Cricket Test Match,Delhi,Sports,Test match between India and South Africa
...,...,...,...,...,...
57,2017-05-19,Pro Kabaddi League Match,Bengaluru,Sports,Professional Kabaddi league match
58,2017-06-05,Cricket Test Match,Chennai,Sports,Test match between India and Bangladesh
59,2017-06-10,Ed Sheeran Concert,Chennai,Concert,Live performance by the singer Ed Sheeran
60,2017-06-15,Pro Kabaddi League Match,Mumbai,Sports,Professional Kabaddi league match


### 2). Integrating data from various sources.

#### Developing a data integration pipeline to merge Traffic, Weather, and Event data into a unified dataset, ensuring data is synchronized based on timestamps to align traffic data with corresponding weather conditions and events.

In [101]:
# Merging datasets on 'Datetime'
merge_data = pd.merge(traffic_data, weather_data, on=['Datetime'], how='left')
Integrated_Dataset = pd.merge(merge_data, event_data, on=['Datetime'], how='left')
print(merge_data)
print(Integrated_Dataset)

                 Datetime  Junction  Vehicles           ID  Temperature  \
0     2015-11-01 00:00:00         1        15  20151101001         17.0   
1     2015-11-01 01:00:00         1        13  20151101011          NaN   
2     2015-11-01 02:00:00         1        10  20151101021          NaN   
3     2015-11-01 03:00:00         1         7  20151101031          NaN   
4     2015-11-01 04:00:00         1         9  20151101041          NaN   
...                   ...       ...       ...          ...          ...   
48115 2017-06-30 19:00:00         4        11  20170630194          NaN   
48116 2017-06-30 20:00:00         4        30  20170630204          NaN   
48117 2017-06-30 21:00:00         4        16  20170630214          NaN   
48118 2017-06-30 22:00:00         4        22  20170630224          NaN   
48119 2017-06-30 23:00:00         4        12  20170630234          NaN   

       Humidity  Precipitation  Windspeed  
0          75.6            0.0        8.3  
1          

### 3). Handling data quality issues.
#### (i). Cleaning the dataset by removing duplicates, handling missing values (imputation or removal), and correcting inconsistencies.

In [93]:
# Identify duplicate records
duplicate_mask = Integrated_Dataset.duplicated()
duplicates = Integrated_Dataset[duplicate_mask]
print(duplicates)

# Remove duplicates
Integrated_Dataset = Integrated_Dataset.drop_duplicates()

Empty DataFrame
Columns: [Datetime, Junction, Vehicles, ID, Temperature, Humidity, Precipitation, Windspeed, Event Name, Location, Type, Description]
Index: []


In [103]:
# NOTE :- We will be only removing missing values from merged data of traffic and weather, as events occur ocassionaly.

# Removing rows with missing values 
merge_data = merge_data.dropna() 

# Reseting index
merge_data = merge_data.reset_index(drop=True)

print("\nDataset after removing rows with Missing Values:\n")
print(merge_data)

# Check for any remaining missing values
print(merge_data .isnull().sum())

Integrated_Dataset = pd.merge(merge_data, event_data, on=['Datetime'], how='left')
print(Integrated_Dataset)


Dataset after removing rows with Missing Values:

       Datetime  Junction  Vehicles           ID  Temperature  Humidity  \
0    2015-11-01         1        15  20151101001         17.0      75.6   
1    2015-11-02         1        14  20151102001         16.8      77.4   
2    2015-11-03         1        21  20151103001         16.2      78.1   
3    2015-11-04         1        18  20151104001         16.1      72.9   
4    2015-11-05         1        19  20151105001         16.7      62.9   
...         ...       ...       ...          ...          ...       ...   
2000 2017-06-26         4         6  20170626004         28.7      70.2   
2001 2017-06-27         4         9  20170627004         28.4      79.8   
2002 2017-06-28         4         6  20170628004         25.2      91.3   
2003 2017-06-29         4        14  20170629004         24.6      92.5   
2004 2017-06-30         4         9  20170630004         25.6      89.9   

      Precipitation  Windspeed  
0             0

In [112]:
# Merging the new_merged_dataset and event dataset. 
Integrated_Dataset = pd.merge(new_merged_data, event_data, on=['Datetime'], how='left')

# Handle missing values in the final merged dataset
# For simplicity, let's fill missing values with a placeholder 'Unknown'.
Integrated_Dataset.fillna({
    'Event Name': 'No Event',
    'Location': 'NA',
    'Type': 'NA',
    'Description': 'NA',
    'Source': 'NA'
}, inplace=True)

#Displaying Final Integrated Dataset after handling missing values
print(Integrated_Dataset)

       Datetime  Junction  Vehicles           ID  Temperature  Humidity  \
0    2015-11-01         1        15  20151101001         17.0      75.6   
1    2015-11-02         1        14  20151102001         16.8      77.4   
2    2015-11-03         1        21  20151103001         16.2      78.1   
3    2015-11-04         1        18  20151104001         16.1      72.9   
4    2015-11-05         1        19  20151105001         16.7      62.9   
...         ...       ...       ...          ...          ...       ...   
2000 2017-06-26         4         6  20170626004         28.7      70.2   
2001 2017-06-27         4         9  20170627004         28.4      79.8   
2002 2017-06-28         4         6  20170628004         25.2      91.3   
2003 2017-06-29         4        14  20170629004         24.6      92.5   
2004 2017-06-30         4         9  20170630004         25.6      89.9   

      Precipitation  Windspeed                Event Name   Location  \
0             0.000        8

### (ii). Normalizing or standardizing data to bring different variables to a common scale.

In [113]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
Integrated_Dataset['vehicle_scaled'] = scaler.fit_transform(Integrated_Dataset[['Vehicles']])

# Display normalized data
print("\nNormalized Integrated Dataset :")
print(Integrated_Dataset.head(10))



Normalized Integrated Dataset :
    Datetime  Junction  Vehicles           ID  Temperature  Humidity  \
0 2015-11-01         1        15  20151101001         17.0      75.6   
1 2015-11-02         1        14  20151102001         16.8      77.4   
2 2015-11-03         1        21  20151103001         16.2      78.1   
3 2015-11-04         1        18  20151104001         16.1      72.9   
4 2015-11-05         1        19  20151105001         16.7      62.9   
5 2015-11-06         1        16  20151106001         15.6      67.5   
6 2015-11-07         1        16  20151107001         15.4      73.0   
7 2015-11-08         1        13  20151108001         15.6      71.6   
8 2015-11-09         1        13  20151109001         15.6      72.9   
9 2015-11-10         1        19  20151110001         15.8      69.4   

   Precipitation  Windspeed                Event Name   Location  \
0            0.0        8.3           Diwali Festival  Pan India   
1            0.3        7.9  Pro Kabad

### ->  Saving the Final Merged dataset to CSV.

In [114]:
Integrated_Dataset.to_csv('Integrated_Dataset.csv', index=False)

### -> Importing the Integrated Dataset.

In [115]:
df = pd.read_csv('Integrated_Dataset.csv')
df

Unnamed: 0,Datetime,Junction,Vehicles,ID,Temperature,Humidity,Precipitation,Windspeed,Event Name,Location,Type,Description,vehicle_scaled
0,2015-11-01,1,15,20151101001,17.0,75.6,0.000,8.3,Diwali Festival,Pan India,Public Holiday,Annual Hindu festival of lights,-0.446144
1,2015-11-02,1,14,20151102001,16.8,77.4,0.300,7.9,Pro Kabaddi League Final,Hyderabad,Sports,Final match of the Pro Kabaddi League Season 2,-0.498381
2,2015-11-03,1,21,20151103001,16.2,78.1,0.800,8.3,No Event,,,,-0.132718
3,2015-11-04,1,18,20151104001,16.1,72.9,0.000,6.5,No Event,,,,-0.289431
4,2015-11-05,1,19,20151105001,16.7,62.9,1.500,9.0,No Event,,,,-0.237193
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2000,2017-06-26,4,6,20170626004,28.7,70.2,0.900,24.1,No Event,,,,-0.916283
2001,2017-06-27,4,9,20170627004,28.4,79.8,5.344,22.3,No Event,,,,-0.759570
2002,2017-06-28,4,6,20170628004,25.2,91.3,19.666,27.7,No Event,,,,-0.916283
2003,2017-06-29,4,14,20170629004,24.6,92.5,4.591,13.0,No Event,,,,-0.498381
