# init

In [49]:
import numpy as np
import time
import requests
import pandas as pd
from tqdm import tqdm
from datetime import datetime, timedelta
from geopy.geocoders import get_geocoder_for_service
import openmeteo_requests
import requests_cache
from retry_requests import retry

In [28]:
!pip install openmeteo-requests
!pip install requests-cache retry-requests numpy pandas
!pip install geopy



## Import Dataset

In [29]:
df = pd.read_csv('input/filtered_dataset_oneliner.csv')

## Functions

In [31]:
# Function to get Latitude and Longitudes
def get_lat_long(place):
    if len(place) > 30:
        print(f"Skipping place '{place}' as length is more than 30 characters.")
        return None, None
    
    url = f"https://geocode.maps.co/search?q={place.replace(' ', '+')}&api_key={apikey}"
    response = requests.get(url)
    if response.status_code == 200:
        try:
            data = response.json()
            if data:
                return data[0]['lat'], data[0]['lon']
        except ValueError as e:
            print(f"Error parsing JSON: {e}")
    else:
        print(f"Error: {response.status_code}, {response.text}")
    return None, None

In [50]:
#Function to extract states
def geocode(geocoder, config, query):
    cls = get_geocoder_for_service(geocoder)
    geolocator = cls(**config)
    location = geolocator.geocode(query)
    return location.address

In [33]:
#Extracting dates and Average Weather Functions

def extract_date(date_str):
    date_str = date_str.strip() if date_str else ''  # Remove leading and trailing whitespaces, handle None
    try:
        return datetime.strptime(date_str, 'Updated: %b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')
    except ValueError:
        try:
            return datetime.strptime(date_str, '%b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')
        except ValueError:
            return '1970-01-01'  # Return a default date for invalid dates or None

def get_week_avg_weather(lat, lon, start_date, end_date):
    if pd.isnull(lat):  # Skip rows with missing latitude
        return None
    
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": end_date,
        "hourly": "temperature_2m"
    }
    responses = openmeteo.weather_api("https://archive-api.open-meteo.com/v1/archive", params=params)
    response = responses[0]  # Assuming only one location is being queried
    hourly = response.Hourly()
    hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
    return pd.Series(hourly_temperature_2m).mean()

In [34]:
# Extracting date for precipitation
def extract_date(date_str):
    date_str = date_str.strip()  # Remove leading and trailing whitespaces
    try:
        return datetime.strptime(date_str, 'Updated: %b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')
    except ValueError:
        return datetime.strptime(date_str, '%b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')

In [35]:
# Exracting Precipitation Data - 3 day average

def get_precipitation_3days(lat, lon, start_date):
    if pd.isnull(lat):  # Skip rows with missing latitude
        return None
    
    total_precipitation = 0
    for i in range(-1, 2):  # Loop for the current day and the two days before and after
        date = (datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=i)).strftime('%Y-%m-%d')
        params = {
            "latitude": lat,
            "longitude": lon,
            "start_date": date,
            "end_date": date,
            "hourly": "precipitation"
        }
        responses = openmeteo.weather_api("https://archive-api.open-meteo.com/v1/archive", params=params)
        response = responses[0]  # Assuming only one location is being queried
        hourly_precipitation = response.Hourly().Variables(0).ValuesAsNumpy()
        total_precipitation += hourly_precipitation.sum()
    
    return total_precipitation

# PLACE OF ACCIDENT

In [36]:
# Split 'content' column by ':' and extract the first part
df['place'] = df['content'].str.split(':').str[0].str.strip().str.title()

# LATITUDE AND LONGITUDE

In [37]:
with open("geocode_api.txt") as apikey_file:
    apikey = apikey_file.readline()
# Function to get latitude and longitude for a place

# Apply function to DataFrame with delay
df['latitude'] = None
df['longitude'] = None
for index, row in tqdm(df.iterrows(), total=len(df)):
    if len(row['place']) <= 30:
        latitude, longitude = get_lat_long(row['place'])
        df.at[index, 'latitude'] = latitude
        df.at[index, 'longitude'] = longitude
    else:
        print(f"Skipping place '{row['place']}' as length is more than 30 characters.")
    time.sleep(1)  # Delay of 1 second

# Display the DataFrame with latitude and longitude
print(df)

100%|██████████| 5/5 [00:06<00:00,  1.34s/it]

            id                                               link  \
5581  71099572  https://timesofindia.indiatimes.com/city/chand...   
540   99448268  https://timesofindia.indiatimes.com/india/4-in...   
2049  96043991  https://timesofindia.indiatimes.com/city/indor...   
3435  90648272  https://timesofindia.indiatimes.com/city/mumba...   
2730  78991646  https://timesofindia.indiatimes.com/city/amara...   

                                                content  \
5581  AMBALA: At least six people were injured after...   
540   KATHMANDU : Four Indians were killed and anoth...   
2049  Indore: Two persons were killed and one injure...   
3435  MUMBAI: Two second-year college students were ...   
2730  HYDERABAD: Four red sanders smugglers were cha...   

                             news_date  \
5581  Updated: Sep 12, 2019, 19:38 IST   
540   Updated: Apr 13, 2023, 06:54 IST   
2049   Updated: Dec 7, 2022, 09:29 IST   
3435            Apr 4, 2022, 21:51 IST   
2730            Nov 




# STATES

In [51]:
place = list(df['place'])

In [52]:
from tqdm import tqdm

# Dictionary of Indian states
indian_states = {
    'Andaman and Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh',
    'Dadra and Nagar Haveli and Daman and Diu', 'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir',
    'Jharkhand', 'Karnataka', 'Kerala', 'Ladakh', 'Lakshadweep', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya',
    'Mizoram', 'Nagaland', 'Odisha', 'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
    'Uttar Pradesh', 'Uttarakhand', 'West Bengal'
}

states = []

for i in tqdm(range(len(place)), desc="Processing"):
    try:
        split_result = geocode("nominatim", dict(user_agent="aakarshsurendra"), place[i]).split(',')
        state = None
        for part in split_result:
            part = part.strip()
            if part in indian_states:
                state = part
                break
            elif part and not any(char.isdigit() or char.isalpha() for char in part):
                # Skip parts that are not alphanumeric (e.g., blank, unwanted characters)
                continue
        states.append(state)
    except:
        states.append(None)

Processing: 100%|██████████| 5/5 [00:02<00:00,  2.14it/s]


In [53]:
df['state']=states

In [58]:
df

Unnamed: 0,id,link,content,news_date,first_line,place,latitude,longitude,state,week_avg_weather,precipitation_3days
5581,71099572,https://timesofindia.indiatimes.com/city/chand...,AMBALA: At least six people were injured after...,1970-01-01,At least six people were injured after a car p...,Ambala,30.3843674,76.770421,Haryana,13.874687,0.0
540,99448268,https://timesofindia.indiatimes.com/india/4-in...,KATHMANDU : Four Indians were killed and anoth...,1970-01-01,Four Indians were killed and another was serio...,Kathmandu,27.708317,85.3205817,,10.140416,0.0
2049,96043991,https://timesofindia.indiatimes.com/city/indor...,Indore: Two persons were killed and one injure...,1970-01-01,Two persons were killed and one injured in thr...,Indore,22.7203616,75.8681996,Madhya Pradesh,19.730093,0.0
3435,90648272,https://timesofindia.indiatimes.com/city/mumba...,MUMBAI: Two second-year college students were ...,1970-01-01,Two second-year college students were killed a...,Mumbai,19.08157715,72.88662753964906,Maharashtra,24.148325,0.0
2730,78991646,https://timesofindia.indiatimes.com/city/amara...,HYDERABAD: Four red sanders smugglers were cha...,1970-01-01,Four red sanders smugglers were charred to dea...,Hyderabad,17.360589,78.4740613,Telangana,20.315657,0.0


# WEATHER AND PRECIPITATION

In [55]:
# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)


# Add a new column 'week_avg_weather' to the DataFrame
df['news_date'] = df['news_date'].apply(extract_date)

tqdm.pandas(desc="Calculating weather")
df['week_avg_weather'] = df.progress_apply(lambda row: get_week_avg_weather(row['latitude'], row['longitude'], row['news_date'], (datetime.strptime(row['news_date'], '%Y-%m-%d') + timedelta(days=7)).strftime('%Y-%m-%d')), axis=1)

print(df[['place', 'news_date', 'week_avg_weather']])

Calculating weather: 100%|██████████| 5/5 [00:00<00:00, 403.75it/s]

          place   news_date  week_avg_weather
5581     Ambala  1970-01-01         13.874687
540   Kathmandu  1970-01-01         10.140416
2049     Indore  1970-01-01         19.730093
3435     Mumbai  1970-01-01         24.148325
2730  Hyderabad  1970-01-01         20.315657





## Precipitation

In [56]:
# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Add a new column 'precipitation_3days' to the DataFrame
df['news_date'] = df['news_date'].apply(extract_date)

tqdm.pandas()
df['precipitation_3days'] = df.progress_apply(lambda row: get_precipitation_3days(row['latitude'], row['longitude'], row['news_date']), axis=1)

100%|██████████| 5/5 [00:00<00:00, 209.06it/s]


In [59]:
df.to_csv('output/entity_extracted.csv', index = False)

In [None]:
import pandas as pd

# Find major cities (places with more than 20 occurrences)
major_cities = df['place'].value_counts()[df['place'].value_counts() > 20].index.tolist()

# Initialize a dictionary to store combined city data
combined_cities = {}

# Iterate through each major city
for city in major_cities:
    # Find other cities within +/- 0.2 latitude and longitude difference
    nearby_cities = df[(df['place'] != city) & 
                       (df['latitude'].between(df[df['place'] == city]['latitude'].iloc[0] - 0.2, 
                                               df[df['place'] == city]['latitude'].iloc[0] + 0.2)) &
                       (df['longitude'].between(df[df['place'] == city]['longitude'].iloc[0] - 0.2, 
                                                df[df['place'] == city]['longitude'].iloc[0] + 0.2))]
    
    # Combine the cities into the major city
    combined_cities[city] = nearby_cities['place'].tolist()

# Update the DataFrame with the combined city names and adjust latitudes and longitudes
for major_city, cities_to_combine in combined_cities.items():
    # Update place names to the major city name
    df.loc[df['place'].isin(cities_to_combine), 'place'] = major_city
    # Update latitudes and longitudes to the major city's values
    df.loc[df['place'] == major_city, 'latitude'] = df[df['place'] == major_city]['latitude'].mean()
    df.loc[df['place'] == major_city, 'longitude'] = df[df['place'] == major_city]['longitude'].mean()

# Save the updated DataFrame to a new Excel file
df.to_excel('updated_cities.xlsx', index=False)
