# IMPORTING AND FILTERING THE NEWS

In [None]:
!pip install -U spacy
!pip install -U spacy-transformers
!python -m spacy download en_core_web_trf

In [None]:
import pandas as pd
import spacy
import pickle
import re

# Load the transformer model
nlp = spacy.load("en_core_web_trf")


file_path = '/Users/aakarshsurendra/Desktop/ROAD_NLP/02_dataset/data_road.csv'

dataframe = pd.read_csv(file_path)


In [None]:
!python -m spacy validate

### EXTRACTING PLACE OF ACCIDENT

In [None]:
places = dataframe['content'].str.split(':').str[0].str.strip()
dataframe['place'] = places

### CREATING IDs FOR EACH NEWS

In [None]:
# Function to extract id from link
def extract_id(link):
    match = re.search(r'\b(\d{6,11})\b', link)
    if match:
        return match.group(1)
    else:
        return None

# Apply the function to extract ids
dataframe['id'] = dataframe['Link'].apply(extract_id)
order = ['id','Link','content','News_date','place']
dataframe = dataframe[order]

## TEXT PRE-PROCESSING FUNCTION

In [None]:
# Function to preprocess text using SpaCy
def preprocess_text_spacy(text):
    # Tokenize text and remove stopwords, punctuation, and lemmatize tokens
    doc = nlp(text)
    processed_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return processed_tokens

# STAGE I CLUSTERING

## DO NOT RUN

#### Run only once and store the output TF-IDF vector file as pickle -

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Complete data (list of article texts)
article_texts = dataframe.content

# Handling NaN values by replacing them with empty strings
article_texts_cleaned = article_texts.fillna('')

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(tokenizer=preprocess_text_spacy)

# Fit and transform the data
tfidf_matrix = tfidf_vectorizer.fit_transform(dataframe)

#### Exporting the pickle file

# Specify the file path where you want to save the TF-IDF matrix
file_path = "tfidf_matrix.pkl"

# Save the TF-IDF matrix to a file
with open(file_path, 'wb') as f:
    pickle.dump(tfidf_matrix, f)

## Load Pickle

#### Importing the pickle file - Run this

In [None]:
# Specify the file path where the TF-IDF matrix is saved
file_path = "/Users/aakarshsurendra/Desktop/ROAD_NLP/07_pickle_files/tfidf_matrix.pkl"

# Load the TF-IDF matrix from the file
with open(file_path, 'rb') as f:
    tfidf_matrix = pickle.load(f)

### Clustering - To remove the unrelated news

In [None]:
from sklearn.cluster import KMeans

# Initialize K-means clustering
num_clusters = 2  # You can adjust this based on the number of desired clusters
# kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init=18, random_state=42)

# Fit the clustering model
kmeans.fit(tfidf_matrix)

# Get the cluster labels
cluster_labels = kmeans.labels_

### PCA

In [None]:
import numpy as np

# Get cluster centers from K-means model
cluster_centers = kmeans.cluster_centers_

# Visualize clusters with cluster centers
plt.figure(figsize=(10, 8))
for cluster_label in range(num_clusters):
    plt.scatter(tfidf_matrix_2d[cluster_labels == cluster_label, 0],
                tfidf_matrix_2d[cluster_labels == cluster_label, 1],
                label=f'Cluster {cluster_label}')
plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1],
            marker='x', color='black', label='Cluster Centers')
plt.title('Clustering of News Articles on Road Accidents with Cluster Centers')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()


### Cluster Extraction, Model Difference and Extracting the Unwanted News Dataset

In [None]:
stage1_cluster_0_points = [i for i in range(len(tfidf_matrix_2d)) if cluster_labels[i] == 0]
stage1_cluster_1_points = [i for i in range(len(tfidf_matrix_2d)) if cluster_labels[i] == 1]

In [None]:
import random
random_10_elements = random.sample(model1_cluster_0_points, k=15)
print(random_10_elements)

#### Verifying the unwanted news

In [None]:
import random

for index in random_10_elements:
    news_article = dataframe.loc[index, 'content']
    print(f"News Article {index}:\n{news_article}\n")

#### EXTRACT THE STAGE I FILTERNED NEWS

In [None]:
#Filtered Dataset from Stage I Clustering
filtered_dataset_1 = dataframe.drop(stage1_cluster_0_points)

# Reset the index of the new DataFrame
filtered_dataset_1.reset_index(drop=True, inplace=True)
filtered_dataset_1.drop(columns = ['Unnamed: 0'], inplace = True, errors='ignore')

In [None]:
filtered_dataset_1['id'] = filtered_dataset_1['Link'].apply(extract_id)
order = ['id','Link','content','News_date']
filtered_dataset_1 = filtered_dataset_1[order]

#### EXTRACT THE UNWANTED NEWS TO FEED TO STAGE II CLUSTERING

In [None]:
unwanted_news_stage1 = dataframe.drop(stage1_cluster_1_points)
unwanted_news_stage_1.to_csv('Unwanted_News.csv', index = False)

# STAGE II CLUSTERING

## DO NOT RUN

#### Vectorize the Unwanted News

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Complete data (list of article texts)
article_texts = unwanted_news.content

# Handling NaN values by replacing them with empty strings
unwanted_article_texts_cleaned = article_texts.fillna('')

# Initialize TF-IDF vectorizer
unwanted_tfidf_vectorizer = TfidfVectorizer(tokenizer=preprocess_text_spacy)

# Fit and transform the data
unwanted_tfidf_matrix = unwanted_tfidf_vectorizer.fit_transform(unwanted_article_texts_cleaned)

#### Exporting the pickle file - DO NOT RUN THIS

In [None]:
import pickle

# Specify the file path where you want to save the TF-IDF matrix
file_path = "unwanted_tfidf_matrix.pkl"

# Save the TF-IDF matrix to a file
with open(file_path, 'wb') as f:
    pickle.dump(unwanted_tfidf_matrix, f)

## Importing the pickle file - RUN THIS

In [None]:
import pickle

unwanted_news = pd.read_csv('unwanted_News.csv')
# Specify the file path where the TF-IDF matrix is saved
file_path = "unwanted_tfidf_matrix.pkl"

# Load the TF-IDF matrix from the file
with open(file_path, 'rb') as f:
    unwanted_tfidf_matrix = pickle.load(f)

## STAGE II CLUSTERING/SIMILARITY CHECK USING TRANSFORMER

In [None]:
pip install -U sentence-transformers

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load a pre-trained SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Example statement
example_statement = "Udaipur: Three members of a family from Banswara district were killed in a road accident on Monday morning when the car they were travelling in rammed into a stationary truck on eight-lane Delhi-Mumbai expressway near Mandsaur in Madhya Pradesh. Three others travelling in the car were seriously injured. The incident took place around 6am when the car with members of a family rammed into a stationary truck. Two women occupants and a man were killed, and three other family members were grievously injured, said the police. The family from Banswara‚Äôs Mohan Colony was traveling by car from Mandsaur via Ratlam towards Banswara. A truck was parked on the highway after its tyre burst. Due to fog and low visibility, the car rammed into the truck from behind. In the accident, Ruchi Upadhyay (55), Deepika Trivedi (42) died on the spot while Gopesh Upadhyay (57) died during treatment. Three others, including the driver who were seriously injured, were taken to Ratlam Medical College Hospital."

# Assuming your dataset is stored in a DataFrame called 'unwanted_news'
# Filter out any non-string values from the 'content' column
unwanted_news = unwanted_news[unwanted_news['content'].apply(lambda x: isinstance(x, str))]

# Compute embeddings for the example statement
example_statement_embedding = model.encode([example_statement])[0]

# Compute embeddings for each content value in the dataset
content_embeddings = model.encode(unwanted_news['content'].tolist())

# Calculate cosine similarity between the example statement embedding and each content embedding
similarities = cosine_similarity([example_statement_embedding], content_embeddings)[0]

# Add the similarity scores as a new column in the DataFrame
unwanted_news['similarity_score'] = similarities

In [None]:
# Assuming 'unwanted_news' is your DataFrame
similar_news = unwanted_news.loc[unwanted_news['similarity_score'] > 0.60]

similar_news.drop(columns = ['similarity_score','Unnamed: 0'], inplace = True, errors='ignore')

## EXTRACTING STAGE II FILTERED DATA

In [None]:
similar_news['id'] = similar_news['Link'].apply(extract_id)
order = ['id','Link','content','News_date']
filtered_dataset_2 = similar_news[order]

In [None]:
filtered_dataset = pd.concat([filtered_dataset_1, filtered_dataset_2], ignore_index=True)
filtered_dataset

In [None]:
filtered_dataset.to_csv('filtered_dataset.csv')

# EXTRACTING FIRST SENTENCE

In [None]:
df = pd.read_csv('filtered_dataset.csv')

In [None]:
import spacy
def One_Liners(dataset):
    nlp = spacy.load("en_core_web_trf")
    extracted_text = []

    for content in dataset['content']:
        doc = nlp(content)
        lines = content.split('\n')
        found = False
        for line in lines:
            if ":" in line:
                first_line_after_colon = line.split(":", 1)[1].strip()
                text_till_period = first_line_after_colon.split(".")[0]
                extracted_text.append(text_till_period)
                found = True
                break

        if not found:
            # Take the first row if no colon is found
            extracted_text.append(content.split(".")[0])

    # Append the extracted text to the dataset with a new column named "First Line"
    dataset['First_Line'] = extracted_text

In [None]:
One_Liners(df)

In [None]:
df.to_csv('filtered_dataset_oneliner.csv', index = False)

In [None]:
df = pd.read_csv('filtered_dataset_oneliner.csv')

### EXTRACTING PLACE OF ACCIDENT

In [None]:
places = df['content'].str.split(':').str[0].str.strip()
df['place'] = places
df['place'] = df['place'].str.title()

### CREATING IDs FOR EACH NEWS

In [None]:
# Function to extract id from link
def extract_id(link):
    match = re.search(r'\b(\d{6,11})\b', link)
    if match:
        return match.group(1)
    else:
        return None

# Apply the function to extract ids
df['id'] = df['Link'].apply(extract_id)
order = ['id','place','Link','content','News_date','First_Line']
df = df[order]

In [None]:
# Split 'place' column by spaces and calculate token count
df['places_token_count'] = df['place'].str.split(' ').apply(lambda x: len(x) if isinstance(x, list) else np.nan)

# Replace NaN values with 0
df['places_token_count'] = df['places_token_count'].fillna(0)

# Convert to integer
df['places_token_count'] = df['places_token_count'].astype(int)

# Display the updated dataframe
print(df)

### Verifying if places_token_count values above 5 are unwanted news

In [None]:
import matplotlib.pyplot as plt

counts_1 = len(df[df['places_token_count'] == 1])
counts_2 = len(df[df['places_token_count'] == 2])
counts_3 = len(df[df['places_token_count'] == 3])
counts_4 = len(df[df['places_token_count'] == 4])
counts_5 = len(df[df['places_token_count'] == 5])
counts_above_5 = len(df[df['places_token_count'] > 5])
categories = ['1', '2', '3', '4', '5', 'Above 5']
counts = [counts_1, counts_2, counts_3, counts_4, counts_5, counts_above_5]

plt.figure(figsize=(10, 6))
plt.bar(categories, counts, edgecolor='black')
plt.xlabel('Token Count')
plt.ylabel('Frequency')
plt.title('Distribution of Token Counts in places_token_count')
plt.show()

In [None]:
df.drop(columns =['places_token_count'],inplace = True)

In [None]:
df.to_csv('dataframe_clustering.csv', index =False)

## Extracting Latitude and Longitude Co-Ordinates

In [None]:
import requests
import pandas as pd
import time
from tqdm import tqdm

# Function to get latitude and longitude for a place
def get_lat_long(place):
    if len(place) > 30:
        print(f"Skipping place '{place}' as length is more than 30 characters.")
        return None, None
    
    url = f"https://geocode.maps.co/search?q={place.replace(' ', '+')}&api_key=660742ac8b0c9483916236igpc9edea"
    response = requests.get(url)
    if response.status_code == 200:
        try:
            data = response.json()
            if data:
                return data[0]['lat'], data[0]['lon']
        except ValueError as e:
            print(f"Error parsing JSON: {e}")
    else:
        print(f"Error: {response.status_code}, {response.text}")
    return None, None


# Apply function to DataFrame with delay
df['latitude'] = None
df['longitude'] = None
for index, row in tqdm(df.iterrows(), total=len(df)):
    if len(row['place']) <= 30:
        latitude, longitude = get_lat_long(row['place'])
        df.at[index, 'latitude'] = latitude
        df.at[index, 'longitude'] = longitude
    else:
        print(f"Skipping place '{row['place']}' as length is more than 30 characters.")
    time.sleep(1)  # Delay of 1 second

# Display the DataFrame with latitude and longitude
print(df)

In [None]:
df.to_csv('dataframe_lat_long.csv')

## Extracting States

In [13]:
import pandas as pd
states_data = pd.read_csv('dataframe_lat_long.csv')

In [18]:
place = list(states_data['place'])

In [19]:
from geopy.geocoders import get_geocoder_for_service

In [20]:
def geocode(geocoder, config, query):
    cls = get_geocoder_for_service(geocoder)
    geolocator = cls(**config)
    location = geolocator.geocode(query)
    return location.address

In [21]:
from tqdm import tqdm

# Dictionary of Indian states
indian_states = {
    'Andaman and Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh',
    'Dadra and Nagar Haveli and Daman and Diu', 'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir',
    'Jharkhand', 'Karnataka', 'Kerala', 'Ladakh', 'Lakshadweep', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya',
    'Mizoram', 'Nagaland', 'Odisha', 'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
    'Uttar Pradesh', 'Uttarakhand', 'West Bengal'
}

states = []

for i in tqdm(range(len(place)), desc="Processing"):
    try:
        split_result = geocode("nominatim", dict(user_agent="aakarshsurendra"), place[i]).split(',')
        state = None
        for part in split_result:
            part = part.strip()
            if part in indian_states:
                state = part
                break
            elif part and not any(char.isdigit() or char.isalpha() for char in part):
                # Skip parts that are not alphanumeric (e.g., blank, unwanted characters)
                continue
        states.append(state)
    except:
        states.append(None)


Processing: 100%|██████████| 6750/6750 [40:02<00:00,  2.81it/s]   


In [22]:
states_data['state']=states

In [24]:
states_data.to_csv('dataframe_states.csv')

# Weather Extraction

In [None]:
pip install openmeteo-requests

In [None]:
pip install requests-cache retry-requests numpy pandas

In [6]:
import pandas as pd
from datetime import datetime, timedelta
import openmeteo_requests
import requests_cache
from retry_requests import retry
from tqdm import tqdm

# Read the CSV file into a DataFrame
df = pd.read_csv('dataframe_states.csv')

# Keep the first 6100 entries
df_subset = df.iloc[:6100]

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

def extract_date(date_str):
    date_str = date_str.strip()  # Remove leading and trailing whitespaces
    try:
        return datetime.strptime(date_str, 'Updated: %b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')
    except ValueError:
        return datetime.strptime(date_str, '%b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')

def get_week_avg_weather(lat, lon, start_date, end_date):
    if pd.isnull(lat):  # Skip rows with missing latitude
        return None
    
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": end_date,
        "hourly": "temperature_2m"
    }
    responses = openmeteo.weather_api("https://archive-api.open-meteo.com/v1/archive", params=params)
    response = responses[0]  # Assuming only one location is being queried
    hourly = response.Hourly()
    hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
    return pd.Series(hourly_temperature_2m).mean()

# Add a new column 'week_avg_weather' to the DataFrame
df_subset['News_date'] = df_subset['News_date'].apply(extract_date)

tqdm.pandas(desc="Calculating weather")
df_subset['week_avg_weather'] = df_subset.progress_apply(lambda row: get_week_avg_weather(row['latitude'], row['longitude'], row['News_date'], (datetime.strptime(row['News_date'], '%Y-%m-%d') + timedelta(days=7)).strftime('%Y-%m-%d')), axis=1)

print(df_subset[['place', 'News_date', 'week_avg_weather']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['News_date'] = df_subset['News_date'].apply(extract_date)
Calculating weather: 100%|██████████| 6100/6100 [00:04<00:00, 1278.01it/s]

          place   News_date  week_avg_weather
0     Sultanpur  2024-02-23         18.672459
1        Jaipur  2024-02-19         18.577219
2       Raichur  2024-02-18         29.642168
3     New Delhi  2024-02-16         18.050730
4     Hyderabad  2024-02-22         26.445761
...         ...         ...               ...
6095       Agra  2019-06-09         35.530506
6096    Chennai  2019-05-14         30.536737
6097      Surat  2018-10-29          8.159657
6098  Mussoorie  2019-06-09         23.302780
6099    Vellore  2015-03-23         26.487387

[6100 rows x 3 columns]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['week_avg_weather'] = df_subset.progress_apply(lambda row: get_week_avg_weather(row['latitude'], row['longitude'], row['News_date'], (datetime.strptime(row['News_date'], '%Y-%m-%d') + timedelta(days=7)).strftime('%Y-%m-%d')), axis=1)


In [8]:
df_subset.to_csv('dataset_weather.csv')

### Precipitation

In [24]:
# Read the CSV file into a DataFrame
df = pd.read_csv('dataset_weather.csv')

# Keep only the first 1815 entries
df_subset = df.iloc[:1815]

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

def extract_date(date_str):
    date_str = date_str.strip()  # Remove leading and trailing whitespaces
    try:
        return datetime.strptime(date_str, 'Updated: %b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')
    except ValueError:
        return datetime.strptime(date_str, '%b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')

def get_precipitation_3days(lat, lon, start_date):
    if pd.isnull(lat):  # Skip rows with missing latitude
        return None
    
    total_precipitation = 0
    for i in range(-1, 2):  # Loop for the current day and the two days before and after
        date = (datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=i)).strftime('%Y-%m-%d')
        params = {
            "latitude": lat,
            "longitude": lon,
            "start_date": date,
            "end_date": date,
            "hourly": "precipitation"
        }
        responses = openmeteo.weather_api("https://archive-api.open-meteo.com/v1/archive", params=params)
        response = responses[0]  # Assuming only one location is being queried
        hourly_precipitation = response.Hourly().Variables(0).ValuesAsNumpy()
        total_precipitation += hourly_precipitation.sum()
    
    return total_precipitation

# Add a new column 'precipitation_3days' to the DataFrame
df_subset['News_date'] = df_subset['News_date'].apply(extract_date)
df_subset['precipitation_3days'] = df_subset.progress_apply(lambda row: get_precipitation_3days(row['latitude'], row['longitude'], row['News_date']), axis=1)

print(df_subset[['place', 'News_date', 'precipitation_3days']])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['News_date'] = df_subset['News_date'].apply(extract_date)
Calculating weather: 100%|██████████| 1815/1815 [00:03<00:00, 454.61it/s]

          place   News_date  precipitation_3days
0     Sultanpur  2024-02-23             0.200000
1        Jaipur  2024-02-19             0.000000
2       Raichur  2024-02-18             0.000000
3     New Delhi  2024-02-16             0.000000
4     Hyderabad  2024-02-22             0.000000
...         ...         ...                  ...
1810   Dindigul  2021-11-07            97.800005
1811     Indore  2022-12-25             0.000000
1812     Indore  2022-12-03             0.000000
1813     Howrah  2022-12-25             0.000000
1814     Nashik  2022-09-23             4.800000

[1815 rows x 3 columns]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['precipitation_3days'] = df_subset.progress_apply(lambda row: get_precipitation_3days(row['latitude'], row['longitude'], row['News_date']), axis=1)


In [25]:
# Save the processed data to a CSV file
df_subset.to_csv('precipitation_processed_data.csv', index=False)

### Combining Small Cities for Visualization

In [29]:
import pandas as pd

# Read the Excel file into a DataFrame
df = pd.read_csv('dataframe_states.csv')

# Find major cities (places with more than 20 occurrences)
major_cities = df['place'].value_counts()[df['place'].value_counts() > 20].index.tolist()

# Initialize a dictionary to store combined city data
combined_cities = {}

# Iterate through each major city
for city in major_cities:
    # Find other cities within +/- 0.2 latitude and longitude difference
    nearby_cities = df[(df['place'] != city) & 
                       (df['latitude'].between(df[df['place'] == city]['latitude'].iloc[0] - 0.2, 
                                               df[df['place'] == city]['latitude'].iloc[0] + 0.2)) &
                       (df['longitude'].between(df[df['place'] == city]['longitude'].iloc[0] - 0.2, 
                                                df[df['place'] == city]['longitude'].iloc[0] + 0.2))]
    
    # Combine the cities into the major city
    combined_cities[city] = nearby_cities['place'].tolist()

# Update the DataFrame with the combined city names and adjust latitudes and longitudes
for major_city, cities_to_combine in combined_cities.items():
    # Update place names to the major city name
    df.loc[df['place'].isin(cities_to_combine), 'place'] = major_city
    # Update latitudes and longitudes to the major city's values
    df.loc[df['place'] == major_city, 'latitude'] = df[df['place'] == major_city]['latitude'].mean()
    df.loc[df['place'] == major_city, 'longitude'] = df[df['place'] == major_city]['longitude'].mean()

# Save the updated DataFrame to a new Excel file
df.to_excel('updated_cities.xlsx', index=False)
