In [1]:
from pymongo import MongoClient
import json
import glob
import os
import re
import datetime


In [2]:
# Function to import data from a JSON file
def import_weather_data(file_path, schema, collection, city):
    """Load one city's weather records from a JSON file into a collection.

    Records whose ``'tavg'`` is missing or null are dropped. A missing or null
    ``'tmin'`` / ``'tmax'`` is replaced with that record's ``'tavg'``. Every
    kept record is tagged with ``city`` and all kept records are written in a
    single bulk insert.

    Args:
        file_path: Path to a JSON file whose top level is a list of record dicts.
        schema: Unused by this function; accepted so existing callers keep working.
        collection: Object exposing ``insert_many`` (e.g. a pymongo collection).
        city: Value stored under the ``'city'`` key of every inserted record.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(file_path, encoding='utf-8') as file:
        data = json.load(file)

    records = []
    for item in data:
        # Without an average temperature there is nothing to fall back on.
        if item.get('tavg') is None:
            continue
        for key in ('tmin', 'tmax'):
            if item.get(key) is None:
                item[key] = item['tavg']
        item['city'] = city
        records.append(item)

    # One bulk write instead of a round trip per record. insert_many rejects an
    # empty document list, so skip the call when nothing survived filtering.
    if records:
        collection.insert_many(records)


In [3]:
# Function to import data from a JSON file
def import_futures_data(file_path, schema, collection, label):
    """Load futures records from a JSON file and store them one year at a time.

    Consecutive records whose ``'Date'`` falls in the same calendar year are
    collected and handed to ``process_futures_data`` as a single batch. A file
    containing an empty list results in no call at all.

    Args:
        file_path: Path to a JSON file whose top level is a list of record
            dicts, each carrying a ``'Date'`` string formatted ``YYYY-MM-DD``.
        schema: Forwarded unchanged to ``process_futures_data``.
        collection: Forwarded unchanged to ``process_futures_data``.
        label: Forwarded unchanged to ``process_futures_data``.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        KeyError: If a record has no ``'Date'`` key.
        ValueError: If a ``'Date'`` value does not match ``%Y-%m-%d``.
    """
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(file_path, encoding='utf-8') as file:
        data = json.load(file)

    batch = []
    batch_year = None
    for item in data:
        year = datetime.datetime.strptime(item['Date'], "%Y-%m-%d").year
        # A change of year closes the batch collected so far. Only consecutive
        # records are grouped, so unsorted input simply yields more batches.
        if batch and year != batch_year:
            process_futures_data(batch, schema, collection, label)
            batch = []
        batch_year = year
        batch.append(item)

    # Flush the trailing batch; an empty file leaves nothing to flush.
    if batch:
        process_futures_data(batch, schema, collection, label)

def process_futures_data(futures_data, schema, collection, label):
    """Tag a batch of futures records with ``label`` and bulk-insert it.

    Args:
        futures_data: List of record dicts; each one is modified in place to
            gain a ``'Label'`` key.
        schema: Unused by this function; accepted so existing callers keep working.
        collection: Object exposing ``insert_many`` (e.g. a pymongo collection).
        label: Value stored under the ``'Label'`` key of every record.
    """
    # insert_many rejects an empty document list, so an empty batch is a no-op.
    if not futures_data:
        return
    for item in futures_data:
        item['Label'] = label
    collection.insert_many(futures_data)

In [4]:
# Open a client against the local MongoDB server and bind the database and
# the two collections the rest of the notebook writes to
client = MongoClient('mongodb://localhost:27017/')
db = client.get_database('project_3_db')
weather_collection = db.get_collection('weather_data')
futures_collection = db.get_collection('futures_data')


In [5]:
# Weather records: field name -> type label.
# NOTE(review): import_weather_data reads 'tavg' / 'tmin' / 'tmax' and writes
# 'city', none of which are listed here — confirm which naming is current.
weather_schema = dict(
    date='date',
    tavg_fahrenheit='double',
    tmin_fahrenheit='double',
    tmax_fahrenheit='double',
)

# Futures records: field name -> type label.
# NOTE(review): process_futures_data also adds a 'Label' key that is not
# listed here — confirm whether it belongs in the schema.
futures_schema = dict(
    Date='date',
    Open='double',
    High='double',
    Low='double',
    Close='double',
    Adj_Close='double',
    Volume='int',
    ATR='double',
)

In [6]:
# Resolve the data folders relative to the directory the notebook is run from
base_dir = os.getcwd()
weather_directory, futures_directory = (
    os.path.join(base_dir, 'data', subfolder)
    for subfolder in ('weather', 'futures')
)

In [7]:
# Load each city's JSON export (<city>_data.json in the weather folder)
# into the single shared weather collection
cities = [
    'Los Angeles',
    'New York City',
    'Chicago',
    'Detroit',
    'Columbus',
    'Philadelphia',
    'Newark',
    'Houston',
    'Indianapolis',
    'Milwaukee',
]

for city in cities:
    # Every city is written to the same collection
    collection = db['weather_data']
    city_file_path = os.path.join(weather_directory, f'{city}_data.json')
    import_weather_data(city_file_path, weather_schema, collection, city)

Skipping item due to missing or null 'tavg_fahrenheit': {'date': '2011-01-01', 'tavg': 26.8, 'tmin': 19.6, 'tmax': 35.4, 'city': 'Los Angeles'}
Skipping item due to missing or null 'tavg_fahrenheit': {'date': '2011-01-02', 'tavg': 27.7, 'tmin': 21.0, 'tmax': 31.8, 'city': 'Los Angeles'}
Skipping item due to missing or null 'tavg_fahrenheit': {'date': '2011-01-03', 'tavg': 29.1, 'tmin': 21.9, 'tmax': 34.2, 'city': 'Los Angeles'}
Skipping item due to missing or null 'tavg_fahrenheit': {'date': '2011-01-04', 'tavg': 29.1, 'tmin': 20.5, 'tmax': 38.7, 'city': 'Los Angeles'}
Skipping item due to missing or null 'tavg_fahrenheit': {'date': '2011-01-05', 'tavg': 31.1, 'tmin': 24.8, 'tmax': 37.4, 'city': 'Los Angeles'}
Skipping item due to missing or null 'tavg_fahrenheit': {'date': '2011-01-06', 'tavg': 27.1, 'tmin': 20.8, 'tmax': 33.8, 'city': 'Los Angeles'}
Skipping item due to missing or null 'tavg_fahrenheit': {'date': '2011-01-07', 'tavg': 25.5, 'tmin': 15.8, 'tmax': 36.1, 'city': 'Los An

Skipping item due to missing or null 'tavg_fahrenheit': {'date': '2012-01-13', 'tavg': 38.8, 'tmin': 30.0, 'tmax': 51.1, 'city': 'Philadelphia'}
Skipping item due to missing or null 'tavg_fahrenheit': {'date': '2012-01-14', 'tavg': 32.2, 'tmin': 28.4, 'tmax': 35.6, 'city': 'Philadelphia'}
Skipping item due to missing or null 'tavg_fahrenheit': {'date': '2012-01-15', 'tavg': 27.5, 'tmin': 24.8, 'tmax': 32.0, 'city': 'Philadelphia'}
Skipping item due to missing or null 'tavg_fahrenheit': {'date': '2012-01-16', 'tavg': 30.2, 'tmin': 17.6, 'tmax': 42.8, 'city': 'Philadelphia'}
Skipping item due to missing or null 'tavg_fahrenheit': {'date': '2012-01-17', 'tavg': 48.6, 'tmin': 41.0, 'tmax': 55.4, 'city': 'Philadelphia'}
Skipping item due to missing or null 'tavg_fahrenheit': {'date': '2012-01-18', 'tavg': 42.3, 'tmin': 30.2, 'tmax': 55.4, 'city': 'Philadelphia'}
Skipping item due to missing or null 'tavg_fahrenheit': {'date': '2012-01-19', 'tavg': 31.8, 'tmin': 21.2, 'tmax': 41.0, 'city': '

In [8]:
# Import futures data from multiple files.
# os.listdir returns entries in arbitrary order and may include non-JSON
# entries, so filter first and sort before numbering; otherwise the
# 'Winter N' labels could differ between runs or skip numbers.
# NOTE(review): the sort is lexicographic by file name — confirm that this
# matches the intended winter ordering.
futures_files = sorted(
    name for name in os.listdir(futures_directory) if name.endswith('.json')
)
for index, file in enumerate(futures_files):
    label = f'Winter {index + 1}'
    file_path = os.path.join(futures_directory, file)
    import_futures_data(file_path, futures_schema, futures_collection, label)

In [9]:
# Close the MongoDB client now that the weather and futures imports above
# have run; `client` must not be used for further writes after this cell
client.close()