# ENTITY EXTRACTION

## PLACE OF ACCIDENT

In [None]:
places = df['content'].str.split(':').str[0].str.strip()
df['place'] = places
df['place'] = df['place'].str.title()

## CREATING IDs FOR EACH NEWS

In [None]:
# Function to extract id from link
def extract_id(link):
    match = re.search(r'\b(\d{6,11})\b', link)
    if match:
        return match.group(1)
    else:
        return None

# Apply the function to extract ids
df['id'] = df['Link'].apply(extract_id)
order = ['id','place','Link','content','News_date','First_Line']
df = df[order]

In [None]:
# Split 'place' column by spaces and calculate token count
df['places_token_count'] = df['place'].str.split(' ').apply(lambda x: len(x) if isinstance(x, list) else np.nan)

# Replace NaN values with 0
df['places_token_count'] = df['places_token_count'].fillna(0)

# Convert to integer
df['places_token_count'] = df['places_token_count'].astype(int)

# Display the updated dataframe
print(df)

### Verifying if places_token_count values above 5 are unwanted news

In [None]:
import matplotlib.pyplot as plt

counts_1 = len(df[df['places_token_count'] == 1])
counts_2 = len(df[df['places_token_count'] == 2])
counts_3 = len(df[df['places_token_count'] == 3])
counts_4 = len(df[df['places_token_count'] == 4])
counts_5 = len(df[df['places_token_count'] == 5])
counts_above_5 = len(df[df['places_token_count'] > 5])
categories = ['1', '2', '3', '4', '5', 'Above 5']
counts = [counts_1, counts_2, counts_3, counts_4, counts_5, counts_above_5]

plt.figure(figsize=(10, 6))
plt.bar(categories, counts, edgecolor='black')
plt.xlabel('Token Count')
plt.ylabel('Frequency')
plt.title('Distribution of Token Counts in places_token_count')
plt.show()

In [None]:
df.drop(columns =['places_token_count'],inplace = True)

In [None]:
df.to_csv('dataframe_clustering.csv', index =False)

## LATITUDE AND LONGITUDE

In [None]:
import requests
import pandas as pd
import time
from tqdm import tqdm

with open("geocode_api.txt") as apikey_file:
    apikey = apikey_file.readline()
# Function to get latitude and longitude for a place
def get_lat_long(place):
    if len(place) > 30:
        print(f"Skipping place '{place}' as length is more than 30 characters.")
        return None, None
    
    url = f"https://geocode.maps.co/search?q={place.replace(' ', '+')}&api_key={apikey}"
    response = requests.get(url)
    if response.status_code == 200:
        try:
            data = response.json()
            if data:
                return data[0]['lat'], data[0]['lon']
        except ValueError as e:
            print(f"Error parsing JSON: {e}")
    else:
        print(f"Error: {response.status_code}, {response.text}")
    return None, None


# Apply function to DataFrame with delay
df['latitude'] = None
df['longitude'] = None
for index, row in tqdm(df.iterrows(), total=len(df)):
    if len(row['place']) <= 30:
        latitude, longitude = get_lat_long(row['place'])
        df.at[index, 'latitude'] = latitude
        df.at[index, 'longitude'] = longitude
    else:
        print(f"Skipping place '{row['place']}' as length is more than 30 characters.")
    time.sleep(1)  # Delay of 1 second

# Display the DataFrame with latitude and longitude
print(df)

In [None]:
df.to_csv('dataframe_lat_long.csv')

## STATES

In [13]:
import pandas as pd
states_data = pd.read_csv('dataframe_lat_long.csv')

In [18]:
place = list(states_data['place'])

In [19]:
from geopy.geocoders import get_geocoder_for_service

In [20]:
def geocode(geocoder, config, query):
    cls = get_geocoder_for_service(geocoder)
    geolocator = cls(**config)
    location = geolocator.geocode(query)
    return location.address

In [21]:
from tqdm import tqdm

# Dictionary of Indian states
indian_states = {
    'Andaman and Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh',
    'Dadra and Nagar Haveli and Daman and Diu', 'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir',
    'Jharkhand', 'Karnataka', 'Kerala', 'Ladakh', 'Lakshadweep', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya',
    'Mizoram', 'Nagaland', 'Odisha', 'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
    'Uttar Pradesh', 'Uttarakhand', 'West Bengal'
}

states = []

for i in tqdm(range(len(place)), desc="Processing"):
    try:
        split_result = geocode("nominatim", dict(user_agent="aakarshsurendra"), place[i]).split(',')
        state = None
        for part in split_result:
            part = part.strip()
            if part in indian_states:
                state = part
                break
            elif part and not any(char.isdigit() or char.isalpha() for char in part):
                # Skip parts that are not alphanumeric (e.g., blank, unwanted characters)
                continue
        states.append(state)
    except:
        states.append(None)


Processing: 100%|██████████| 6750/6750 [40:02<00:00,  2.81it/s]   


In [22]:
states_data['state']=states

In [24]:
states_data.to_csv('dataframe_states.csv')

## WEATHER AND PRECIPITATION

In [None]:
pip install openmeteo-requests

In [None]:
pip install requests-cache retry-requests numpy pandas

In [1]:
import pandas as pd
from datetime import datetime, timedelta
import openmeteo_requests
import requests_cache
from retry_requests import retry
from tqdm import tqdm

# Read the CSV file into a DataFrame
df = pd.read_csv('dataframe_states.csv')

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

def extract_date(date_str):
    date_str = date_str.strip()  # Remove leading and trailing whitespaces
    try:
        return datetime.strptime(date_str, 'Updated: %b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')
    except ValueError:
        return datetime.strptime(date_str, '%b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')

def get_week_avg_weather(lat, lon, start_date, end_date):
    if pd.isnull(lat):  # Skip rows with missing latitude
        return None
    
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": end_date,
        "hourly": "temperature_2m"
    }
    responses = openmeteo.weather_api("https://archive-api.open-meteo.com/v1/archive", params=params)
    response = responses[0]  # Assuming only one location is being queried
    hourly = response.Hourly()
    hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
    return pd.Series(hourly_temperature_2m).mean()

# Add a new column 'week_avg_weather' to the DataFrame
df['News_date'] = df['News_date'].apply(extract_date)

tqdm.pandas(desc="Calculating weather")
df['week_avg_weather'] = df.progress_apply(lambda row: get_week_avg_weather(row['latitude'], row['longitude'], row['News_date'], (datetime.strptime(row['News_date'], '%Y-%m-%d') + timedelta(days=7)).strftime('%Y-%m-%d')), axis=1)

print(df[['place', 'News_date', 'week_avg_weather']])

Calculating weather: 100%|██████████| 6750/6750 [01:36<00:00, 69.62it/s]  

                              place   News_date  week_avg_weather
0                         Sultanpur  2024-02-23         18.672459
1                            Jaipur  2024-02-19         18.577219
2                           Raichur  2024-02-18         29.642168
3                         New Delhi  2024-02-16         18.050730
4                         Hyderabad  2024-02-22         26.445761
...                             ...         ...               ...
6745                         Rajkot  2019-07-30         27.055136
6746                      Bengaluru  2019-05-28         25.609011
6747                Ambala/Parwanoo  2019-05-04               NaN
6748                        Madurai  2019-06-04         30.240919
6749  Padiyan Ka Purwa (Rae Bareli)  2018-05-10               NaN

[6750 rows x 3 columns]





In [3]:
df.to_csv('dataset_weather.csv')

### Precipitation

In [2]:
import pandas as pd
from datetime import datetime, timedelta
import openmeteo_requests
import requests_cache
from retry_requests import retry
from tqdm import tqdm

# Read the CSV file into a DataFrame
df = pd.read_csv('dataset_weather.csv')

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

def extract_date(date_str):
    date_str = date_str.strip()  # Remove leading and trailing whitespaces
    try:
        return datetime.strptime(date_str, 'Updated: %b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')
    except ValueError:
        return datetime.strptime(date_str, '%b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')

def get_precipitation_3days(lat, lon, start_date):
    if pd.isnull(lat):  # Skip rows with missing latitude
        return None
    
    total_precipitation = 0
    for i in range(-1, 2):  # Loop for the current day and the two days before and after
        date = (datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=i)).strftime('%Y-%m-%d')
        params = {
            "latitude": lat,
            "longitude": lon,
            "start_date": date,
            "end_date": date,
            "hourly": "precipitation"
        }
        responses = openmeteo.weather_api("https://archive-api.open-meteo.com/v1/archive", params=params)
        response = responses[0]  # Assuming only one location is being queried
        hourly_precipitation = response.Hourly().Variables(0).ValuesAsNumpy()
        total_precipitation += hourly_precipitation.sum()
    
    return total_precipitation

# Add a new column 'precipitation_3days' to the DataFrame
df['News_date'] = df['News_date'].apply(extract_date)

tqdm.pandas()
df['precipitation_3days'] = df.progress_apply(lambda row: get_precipitation_3days(row['latitude'], row['longitude'], row['News_date']), axis=1)

print(df[['place', 'News_date', 'precipitation_3days']])

100%|██████████| 6750/6750 [07:56<00:00, 14.16it/s] 

                              place   News_date  precipitation_3days
0                         Sultanpur  2024-02-23             0.200000
1                            Jaipur  2024-02-19             0.000000
2                           Raichur  2024-02-18             0.000000
3                         New Delhi  2024-02-16             0.000000
4                         Hyderabad  2024-02-22             0.000000
...                             ...         ...                  ...
6745                         Rajkot  2019-07-30            73.299995
6746                      Bengaluru  2019-05-28             5.900000
6747                Ambala/Parwanoo  2019-05-04                  NaN
6748                        Madurai  2019-06-04            30.000000
6749  Padiyan Ka Purwa (Rae Bareli)  2018-05-10                  NaN

[6750 rows x 3 columns]





In [3]:
# Save the processed data to a CSV file
df.to_csv('entity_data.csv', index=False)

### Combining Small Cities for Visualization

In [29]:
import pandas as pd

# Read the Excel file into a DataFrame
df = pd.read_csv('dataframe_states.csv')

# Find major cities (places with more than 20 occurrences)
major_cities = df['place'].value_counts()[df['place'].value_counts() > 20].index.tolist()

# Initialize a dictionary to store combined city data
combined_cities = {}

# Iterate through each major city
for city in major_cities:
    # Find other cities within +/- 0.2 latitude and longitude difference
    nearby_cities = df[(df['place'] != city) & 
                       (df['latitude'].between(df[df['place'] == city]['latitude'].iloc[0] - 0.2, 
                                               df[df['place'] == city]['latitude'].iloc[0] + 0.2)) &
                       (df['longitude'].between(df[df['place'] == city]['longitude'].iloc[0] - 0.2, 
                                                df[df['place'] == city]['longitude'].iloc[0] + 0.2))]
    
    # Combine the cities into the major city
    combined_cities[city] = nearby_cities['place'].tolist()

# Update the DataFrame with the combined city names and adjust latitudes and longitudes
for major_city, cities_to_combine in combined_cities.items():
    # Update place names to the major city name
    df.loc[df['place'].isin(cities_to_combine), 'place'] = major_city
    # Update latitudes and longitudes to the major city's values
    df.loc[df['place'] == major_city, 'latitude'] = df[df['place'] == major_city]['latitude'].mean()
    df.loc[df['place'] == major_city, 'longitude'] = df[df['place'] == major_city]['longitude'].mean()

# Save the updated DataFrame to a new Excel file
df.to_excel('updated_cities.xlsx', index=False)
