In [None]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the download stream per read() call.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries; each entry is
# downloaded and unpacked into KAGGLE_INPUT_PATH/<directory> by the loop further down.
# NOTE(review): the URL carries X-Goog-Date / X-Goog-Expires / X-Goog-Signature query
# parameters, so it looks like a time-limited signed URL that has presumably expired — confirm.
DATA_SOURCE_MAPPING = 'predicta-1-0-predict-the-unpredictable:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F81884%2F8892995%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240621%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240621T064006Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D883430cf4536a23057494c2e32e2c8923b403eb8d6e8ad0ebac46b4768e11526bf238a7630986fc8f54e44db6cd9745e51792046cbd05b27c7cbeabeb8e0bea6881d3017bcce8e54699c2ea70f244e34fb268f0c69bec1e778e26d2da9696b3d6e8ad1aa3518a5304aaf2dee4cfdf501464a9e476e10c8aa4a7aa41c1d5a21858f0201eb76ad978a356408995315c01c070db856d1b1e91d736b948743bb45b441843d53b8462c286e41789c88d2384e9bbc55be7bcb883a5b3a240f367ae2f6c438b65f8329c40a8db2d7b140435529738c2a59f037c7bc347ef343abedea108b2666cc97147ee4ff0bf9e190d92f7406911a59dc28992e6501b97a7111a71c'

# Directory the datasets are extracted into, and the notebook's working directory.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING and unpack
# the archive into KAGGLE_INPUT_PATH/<directory>. A failed download is reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    if not data_source_mapping:
        # An empty mapping (or a stray comma) yields an empty entry; nothing to download.
        continue
    # The URL part is percent-encoded, so the first ':' is always the separator.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # Content-Length may be absent (None) or zero; the progress bar needs a positive total.
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_bytes:
                    # Clamp so a body longer than the announced length cannot overflow the bar.
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
            # tarfile reopens the file by name, so buffered bytes must be on disk first.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # The handle must not be named `tarfile`: that would rebind the module and
                # break the next non-zip data source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        file_path = os.path.join(folder, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pmdarima

Collecting pmdarima
  Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting Cython!=0.29.18,!=0.29.31,>=0.29 (from pmdarima)
  Downloading Cython-3.0.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
Collecting statsmodels>=0.13.2 (from pmdarima)
  Downloading statsmodels-0.14.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
Collecting patsy>=0.5.6 (from statsmodels>=0.13.2->pmdarima)
  Downloading patsy-0.5.6-py2.py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.9/233.9 kB[0

In [None]:
# Read the three competition CSV files, in this order, into their DataFrames.
historical_weather, sample_submission, submission_key = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/predicta-1-0-predict-the-unpredictable/historical_weather.csv',
        '/kaggle/input/predicta-1-0-predict-the-unpredictable/sample_submission.csv',
        '/kaggle/input/predicta-1-0-predict-the-unpredictable/submission_key.csv',
    )
)

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from pmdarima import auto_arima
import warnings
from sklearn.metrics import mean_squared_error

# Install a blanket "ignore" filter so no warning is shown from here on.
warnings.filterwarnings(action="ignore")


# Preview the top five rows of the historical weather table
historical_weather.head(n=5)

In [None]:
# Data preprocessing
# Parse the 'date' column of both frames into pandas datetimes
for frame in (historical_weather, submission_key):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# Keep only the observations dated 2018-06-01 through 2018-12-31 (both bounds inclusive).
# The former `date_filter` mask (17-31 December 2018) was never applied and has been removed.
in_training_window = (
    (historical_weather['date'] >= '2018-06-01') &
    (historical_weather['date'] <= '2018-12-31')
)

historical_weather = historical_weather[in_training_window]

In [None]:
# Discard every row whose 'avg_temp_c' value is missing.
historical_weather = historical_weather.dropna(axis='index', how='any', subset=['avg_temp_c'])

In [None]:
# Distinct city identifiers, in order of first appearance in the filtered history.
city_column = historical_weather['city_id']
city_ids = city_column.unique()
city_ids

array(['C001', 'C002', 'C003', 'C004', 'C005', 'C007', 'C008', 'C009',
       'C010', 'C011', 'C012', 'C013', 'C014', 'C015', 'C016', 'C017',
       'C018', 'C020', 'C022', 'C023', 'C024', 'C025', 'C027', 'C028',
       'C029', 'C030', 'C031', 'C033', 'C034', 'C035', 'C036', 'C037',
       'C038', 'C039', 'C040', 'C042', 'C043', 'C044', 'C045', 'C046',
       'C047', 'C048', 'C049', 'C051', 'C053', 'C054', 'C055', 'C056',
       'C057', 'C058', 'C059', 'C061', 'C062', 'C064', 'C065', 'C066',
       'C067', 'C068', 'C069', 'C070', 'C071', 'C072', 'C073', 'C074',
       'C076', 'C077', 'C078', 'C079', 'C081', 'C082', 'C083', 'C084',
       'C085', 'C086', 'C087', 'C088', 'C089', 'C090', 'C091', 'C092',
       'C093', 'C094', 'C095', 'C096', 'C097', 'C098', 'C099', 'C100',
       'C101', 'C102', 'C103', 'C104', 'C105', 'C106', 'C107', 'C108',
       'C109', 'C110', 'C111', 'C112'], dtype=object)

In [None]:
# Fit one auto-ARIMA model per city and write its forecasts into submission_key['avg_temp_c'].
if 'avg_temp_c' not in submission_key.columns:
    # Create the target column up front so it exists even if every city is skipped below.
    submission_key['avg_temp_c'] = np.nan

for city_id in city_ids:
    # Chronologically ordered temperature series of this city, indexed by date.
    # Built by chaining on a fresh selection instead of mutating a filtered slice in place.
    city_temps = (
        historical_weather.loc[historical_weather['city_id'] == city_id]
        .set_index('date')['avg_temp_c']
        .sort_index()
    )

    # How many steps past the last training observation each submission row lies.
    # Forecasts are mapped by this explicit offset rather than by index alignment, which
    # silently produces NaN whenever the forecast index is not the submission dates.
    # Assumes one observation per calendar day — TODO confirm against the dataset.
    city_rows = submission_key['city_id'] == city_id
    steps_ahead = (submission_key.loc[city_rows, 'date'] - city_temps.index[-1]).dt.days
    # Rows that are not after the training window (or have no date) cannot be forecast.
    steps_ahead = steps_ahead[steps_ahead >= 1].astype(int)
    if steps_ahead.empty:
        continue

    # The full series is used for fitting; no validation split is held out.
    # NOTE(review): m=12 is the seasonal period; for daily data this looks unusual — confirm.
    model = auto_arima(city_temps, seasonal=True, m=12, trace=True, error_action='ignore', suppress_warnings=True)

    # Forecast far enough ahead to cover the furthest requested date.
    forecast = np.asarray(model.predict(n_periods=int(steps_ahead.max())), dtype=float)

    # Step k of the forecast is element k-1; assign by row label of submission_key.
    submission_key.loc[steps_ahead.index, 'avg_temp_c'] = forecast[steps_ahead.to_numpy() - 1]

Performing stepwise search to minimize aic
 ARIMA(2,1,2)(1,0,1)[12] intercept   : AIC=1107.337, Time=0.67 sec
 ARIMA(0,1,0)(0,0,0)[12] intercept   : AIC=1140.799, Time=0.02 sec
 ARIMA(1,1,0)(1,0,0)[12] intercept   : AIC=1133.649, Time=0.09 sec
 ARIMA(0,1,1)(0,0,1)[12] intercept   : AIC=1131.000, Time=0.10 sec
 ARIMA(0,1,0)(0,0,0)[12]             : AIC=1138.832, Time=0.02 sec
 ARIMA(2,1,2)(0,0,1)[12] intercept   : AIC=1105.540, Time=0.37 sec
 ARIMA(2,1,2)(0,0,0)[12] intercept   : AIC=1104.254, Time=0.18 sec
 ARIMA(2,1,2)(1,0,0)[12] intercept   : AIC=1105.574, Time=0.35 sec
 ARIMA(1,1,2)(0,0,0)[12] intercept   : AIC=1112.091, Time=0.17 sec
 ARIMA(2,1,1)(0,0,0)[12] intercept   : AIC=1103.847, Time=0.16 sec
 ARIMA(2,1,1)(1,0,0)[12] intercept   : AIC=1105.189, Time=0.32 sec
 ARIMA(2,1,1)(0,0,1)[12] intercept   : AIC=1105.162, Time=0.28 sec
 ARIMA(2,1,1)(1,0,1)[12] intercept   : AIC=1107.028, Time=0.59 sec
 ARIMA(1,1,1)(0,0,0)[12] intercept   : AIC=1132.107, Time=0.07 sec
 ARIMA(2,1,0)(0,0,0

In [None]:
# Build the submission frame by dropping the 'city_id' and 'date' columns, then preview it.
helper_columns = ['city_id', 'date']
final_submission = submission_key.drop(helper_columns, axis='columns')
final_submission.head()

Unnamed: 0,submission_ID,avg_temp_c
0,1,11.68025
1,2,11.197456
2,3,10.785284
3,4,10.561602
4,5,10.490797


In [None]:
# Save the final submission to CSV (without the index column).
# The confirmation message is built from the same path so it always names the file written.
submission_path = 'submission.csv'
final_submission.to_csv(submission_path, index=False)
print(f"Submission file saved as {submission_path}")

Submission file saved as final_submission.csv
