In [1]:
import numpy as np
import pandas as pd
import os

In [None]:
# Load the raw traffic counts from disk
traffic_df = pd.read_csv(
    'trafikkdata.csv',
    sep="[|;]",       # regex separator: split fields on either '|' or ';'
    engine='python',  # regex separators need the Python parsing engine
)


In [None]:
# Keep only the rows where the 'Felt' column equals 'Totalt'
is_total_row = traffic_df['Felt'].eq('Totalt')
traffic_df = traffic_df.loc[is_total_row]

In [None]:
# Replace missing values with NaN and make the column numeric.
# '-' is the placeholder for a missing count. Replacing it alone leaves the
# column as strings, so it is also converted with pd.to_numeric; any other
# non-numeric entry is coerced to NaN as well instead of raising.
# .assign() returns a new frame, which avoids writing into a filtered slice.
traffic_df = traffic_df.assign(
    Trafikkmengde=pd.to_numeric(
        traffic_df['Trafikkmengde'].replace('-', np.nan),
        errors='coerce',
    )
)

In [None]:
# Drop unnecessary columns
# TODO: decide whether to use the "from" or the "to" timestamp
kept_columns = ['Dato', 'Fra tidspunkt', 'Trafikkmengde']
traffic_df = traffic_df.loc[:, kept_columns]

In [None]:
# Combine 'Dato' and 'Fra tidspunkt' into one datetime column, keep only the
# traffic volume next to it, and use the timestamp as the index.
traffic_df = (
    traffic_df
    .assign(Tidspunkt=pd.to_datetime(traffic_df['Dato'] + ' ' + traffic_df['Fra tidspunkt']))
    .loc[:, ['Trafikkmengde', 'Tidspunkt']]
    .set_index('Tidspunkt')
)

traffic_df

In [None]:
# Print the index/column summary (dtypes, non-null counts) of the traffic frame
traffic_df.info()

In [None]:
# (rows, columns) of the traffic frame
traffic_df.shape

In [None]:
# Summary statistics for the traffic frame's columns
traffic_df.describe()

In [None]:
# Number of missing values per column
traffic_df.isnull().sum()


In [2]:
# Folder that holds the weather CSV files
folder_path = "weather_data"
# Reuse folder_path instead of repeating the literal. Keep only *.csv entries
# so stray files or sub-folders are not handed to read_csv, and sort the names
# because os.listdir returns entries in arbitrary order.
csv_files = sorted(
    f for f in os.listdir(folder_path)
    if f.lower().endswith('.csv')
)
# Empty starting frame; populated by the cell that reads the files
weather_df = pd.DataFrame()

In [3]:
# Read weather data
# Parse every file first and concatenate once. Growing weather_df inside the
# loop re-copied all previously loaded rows on each iteration, and re-running
# the cell appended the same files again, duplicating rows.
weather_frames = [
    pd.read_csv(os.path.join(folder_path, csv_file))
    for csv_file in csv_files
]
# pd.concat raises on an empty list, so fall back to an empty frame
weather_df = (
    pd.concat(weather_frames, ignore_index=True)
    if weather_frames
    else pd.DataFrame()
)
weather_df.head(50)

Unnamed: 0,Dato,Tid,Globalstraling,Solskinstid,Lufttemperatur,Vindretning,Vindstyrke,Lufttrykk,Vindkast,Relativ luftfuktighet
0,2010-01-01,00:00,0.0,0.0,-4.6,130.0,1.1,999.0,,
1,2010-01-01,00:10,-0.7,0.0,-4.1,158.0,1.6,999.2,,
2,2010-01-01,00:20,0.0,0.0,-3.5,167.0,1.3,999.0,,
3,2010-01-01,00:30,0.0,0.0,-4.1,151.0,0.7,999.0,,
4,2010-01-01,00:40,-0.7,0.0,-4.4,148.0,0.8,998.8,,
5,2010-01-01,00:50,-0.7,0.0,-4.7,148.0,1.2,999.0,,
6,2010-01-01,01:00,0.0,0.0,-4.4,139.0,1.2,998.8,,
7,2010-01-01,01:10,-0.7,0.0,-4.3,134.0,1.0,998.7,,
8,2010-01-01,01:20,0.0,0.0,-4.7,151.0,1.2,998.7,,
9,2010-01-01,01:30,-1.4,0.0,-4.4,140.0,1.0,998.7,,


In [6]:
# Share of all weather rows dated 2022-01-01 or later, used as a proxy for the
# share of rows that contain Relativ luftfuktighet.
# 'Dato' is compared as a string against the cutoff date.
from_2022 = weather_df["Dato"] >= "2022-01-01"
proportion = int(from_2022.sum()) / len(weather_df)
proportion

0.11075391168620219

In [13]:
# Where did Vindkast start being measured? Keep every row that has a value.
has_vindkast = weather_df["Vindkast"].notna()
start_vindkast = weather_df.loc[has_vindkast]

# Compare the number of Vindkast readings with the number of rows recorded
# from the cutoff date onwards.
total_vindkast_measures = len(start_vindkast)
since_cutoff = weather_df["Dato"] >= "2015-01-08"
total_measures = len(weather_df.loc[since_cutoff])
print(
    f'Total measures overall: {total_measures}\n'
    f'Total vindkast measures: {total_vindkast_measures}\n'
    f'Difference: {total_measures-total_vindkast_measures}'
)



Total measures overall: 445746
Total vindkast measures: 445653
Difference: 93


In [None]:
# Remove columns that are not used further (maybe drop more later)
unused_columns = ['Globalstraling', 'Vindretning', 'Relativ luftfuktighet']
weather_df = weather_df.drop(unused_columns, axis=1)

In [None]:
# Combine 'Dato' and 'Tid' into one datetime column, then drop the two parts
weather_df = (
    weather_df
    .assign(Tidspunkt=pd.to_datetime(weather_df['Dato'] + ' ' + weather_df['Tid']))
    .drop(columns=['Dato', 'Tid'])
)

In [None]:
# Display the weather frame
weather_df

In [None]:
# Check for duplicate Tidspunkt values
# Compare the timestamp column only. A whole-row comparison reports a
# duplicate only when every column matches, so a timestamp that repeats with
# different readings would go unnoticed.
duplicates = weather_df['Tidspunkt'].duplicated().any()
duplicates

In [None]:
# Use the timestamp column as the index
weather_df = weather_df.set_index('Tidspunkt')

In [None]:
# Turn every 9999.99 entry into NaN
weather_df = weather_df.replace(to_replace=9999.99, value=np.nan)

In [None]:
# Aggregate the 10-minute readings to one row per hour.
# 'h' is the current spelling of the hourly alias ('H' is deprecated in
# recent pandas releases).
hourly = weather_df.resample('h')
resampled_df = pd.concat(
    [
        # min_count=1: an hour with no valid reading becomes NaN instead of 0,
        # so missing data is not mistaken for "no sunshine".
        hourly['Solskinstid'].sum(min_count=1),
        hourly[['Lufttemperatur', 'Vindstyrke', 'Lufttrykk', 'Vindkast']].mean(),
    ],
    axis=1,
)

In [None]:
# Sanity-check the hourly aggregates, e.g. that max Solskinstid <= 60
resampled_df.describe()

In [None]:
# Join traffic and hourly weather on their datetime indexes.
# Inner join (the merge default): only timestamps present in both frames are kept.
# TODO: decide whether a left or right join would be more appropriate.
merged_df = pd.merge(
    traffic_df,
    resampled_df,
    how='inner',
    left_index=True,
    right_index=True,
)
merged_df

Vi må se litt på Trafikkmengde-kolonnen.

Hvorfor mangler det data her? Vi kan vurdere å droppe disse radene, men per nå gjør vi ingenting



#### Some visualizations to understand the data