# init

In [1]:
import random
import pandas as pd
import spacy
import numpy as np
import pickle
import re
import matplotlib.pyplot as plt
import requests
import time
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

  _torch_pytree._register_pytree_node(


In [None]:
!pip install -U spacy
!pip install -U spacy-transformers
!python -m spacy download en_core_web_trf
!pip install sentence_transformers

In [2]:
!python -m spacy validate

  _torch_pytree._register_pytree_node(
[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation:
/Users/aakarshsurendra/anaconda3/lib/python3.11/site-packages/spacy[0m

NAME              SPACY            VERSION                            
en_core_web_sm    >=3.7.2,<3.8.0   [38;5;2m3.7.1[0m   [38;5;2m✔[0m
en_core_web_lg    >=3.7.2,<3.8.0   [38;5;2m3.7.1[0m   [38;5;2m✔[0m
en_core_web_trf   >=3.7.2,<3.8.0   [38;5;2m3.7.3[0m   [38;5;2m✔[0m



## Functions

In [2]:
# Function to extract id from link
def extract_id(link):
    match = re.search(r'\b(\d{6,11})\b', link)
    if match:
        return match.group(1)
    else:
        return None

In [3]:
# Function to preprocess text using SpaCy
def preprocess_text_spacy(text):
    # Tokenize text and remove stopwords, punctuation, and lemmatize tokens
    doc = nlp(text)
    processed_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return processed_tokens

In [4]:
def One_Liners(dataset):
    nlp = spacy.load("en_core_web_trf")
    extracted_text = []

    for content in dataset['content']:
        doc = nlp(content)
        lines = content.split('\n')
        found = False
        for line in lines:
            if ":" in line:
                first_line_after_colon = line.split(":", 1)[1].strip()
                text_till_period = first_line_after_colon.split(".")[0]
                extracted_text.append(text_till_period)
                found = True
                break

        if not found:
            # Take the first row if no colon is found
            extracted_text.append(content.split(".")[0])

    # Append the extracted text to the dataset with a new column named "First Line"
    dataset['first_line'] = extracted_text

In [5]:
# Load the transformer model
nlp = spacy.load("en_core_web_trf")

#Picking the output from the 01_extraction
file_path = '../01_extraction/output/data_road.csv'

dataframe = pd.read_csv(file_path)

  _torch_pytree._register_pytree_node(


In [6]:
dataframe = dataframe.sample(5)

### CREATING IDs FOR EACH NEWS

In [7]:
# Apply the function to extract ids
dataframe['id'] = dataframe['link'].apply(extract_id)
order = ['id','link','content','news_date']
dataframe = dataframe[order]

# STAGE I CLUSTERING

#### Run only once and store the output TF-IDF vector file as pickle -

In [8]:
#If we need to run TF-IDF for fresh news, set run to True
run = True

In [9]:
if run == True:

    # Complete data (list of article texts)
    article_texts = dataframe.content

    # Handling NaN values by replacing them with empty strings
    article_texts_cleaned = article_texts.fillna('')

    # Initialize TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(tokenizer=preprocess_text_spacy)

    # Fit and transform the data
    tfidf_matrix = tfidf_vectorizer.fit_transform(dataframe)

    #### Exporting the pickle file

    # Specify the file path where you want to save the TF-IDF matrix
    file_path = "tfidf_matrix.pkl"

    # Save the TF-IDF matrix to a file
    with open(file_path, 'wb') as f:
        pickle.dump(tfidf_matrix, f)



## Load Pickle

#### Importing the pickle file - Run this

In [10]:
if run == True:# Specify the file path where the TF-IDF matrix is saved
    file_path = "tfidf_matrix.pkl"

    # Load the TF-IDF matrix from the file
    with open(file_path, 'rb') as f:
        tfidf_matrix = pickle.load(f)

In [11]:
tfidf_matrix

<4x4 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

### Clustering - To remove the unrelated news

In [None]:
# Initialize K-means clustering
num_clusters = 2  # You can adjust this based on the number of desired clusters
# kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init=1, random_state=42, verbose=1)

# Fit the clustering model
kmeans.fit(tfidf_matrix)

# Get the cluster labels
cluster_labels = kmeans.labels_

Initialization complete


### PCA

In [None]:
# Get cluster centers from K-means model
cluster_centers = kmeans.cluster_centers_

# Visualize clusters with cluster centers
plt.figure(figsize=(10, 8))
for cluster_label in range(num_clusters):
    plt.scatter(tfidf_matrix_2d[cluster_labels == cluster_label, 0],
                tfidf_matrix_2d[cluster_labels == cluster_label, 1],
                label=f'Cluster {cluster_label}')
plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1],
            marker='x', color='black', label='Cluster Centers')
plt.title('Clustering of News Articles on Road Accidents with Cluster Centers')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()


### Cluster Extraction, Model Difference and Extracting the Unwanted News Dataset

In [None]:
stage1_cluster_0_points = [i for i in range(len(tfidf_matrix_2d)) if cluster_labels[i] == 0]
stage1_cluster_1_points = [i for i in range(len(tfidf_matrix_2d)) if cluster_labels[i] == 1]

In [None]:
random_10_elements = random.sample(model1_cluster_0_points, k=15)
print(random_10_elements)

#### Verifying the unwanted news

In [None]:
for index in random_10_elements:
    news_article = dataframe.loc[index, 'content']
    print(f"News Article {index}:\n{news_article}\n")

#### EXTRACT THE STAGE I FILTERNED NEWS

In [None]:
#Filtered Dataset from Stage I Clustering
filtered_dataset_1 = dataframe.drop(stage1_cluster_0_points)

# Reset the index of the new DataFrame
filtered_dataset_1.reset_index(drop=True, inplace=True)
filtered_dataset_1.drop(columns = ['Unnamed: 0'], inplace = True, errors='ignore')

In [None]:
filtered_dataset_1['id'] = filtered_dataset_1['link'].apply(extract_id)
order = ['id','link','content','news_date']
filtered_dataset_1 = filtered_dataset_1[order]

#### EXTRACT THE UNWANTED NEWS TO FEED TO STAGE II CLUSTERING

In [None]:
unwanted_news_stage1 = dataframe.drop(stage1_cluster_1_points)
unwanted_news_stage_1.to_csv('Unwanted_News.csv', index = False)

# STAGE II CLUSTERING

### SIMILARITY CHECK USING TRANSFORMER

In [None]:
pip install -U sentence-transformers

In [None]:
# Load a pre-trained SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Example statement
example_statement = "Udaipur: Three members of a family from Banswara district were killed in a road accident on Monday morning when the car they were travelling in rammed into a stationary truck on eight-lane Delhi-Mumbai expressway near Mandsaur in Madhya Pradesh. Three others travelling in the car were seriously injured. The incident took place around 6am when the car with members of a family rammed into a stationary truck. Two women occupants and a man were killed, and three other family members were grievously injured, said the police. The family from Banswara‚Äôs Mohan Colony was traveling by car from Mandsaur via Ratlam towards Banswara. A truck was parked on the highway after its tyre burst. Due to fog and low visibility, the car rammed into the truck from behind. In the accident, Ruchi Upadhyay (55), Deepika Trivedi (42) died on the spot while Gopesh Upadhyay (57) died during treatment. Three others, including the driver who were seriously injured, were taken to Ratlam Medical College Hospital."

# Assuming your dataset is stored in a DataFrame called 'unwanted_news'
# Filter out any non-string values from the 'content' column
unwanted_news = unwanted_news[unwanted_news['content'].apply(lambda x: isinstance(x, str))]

# Compute embeddings for the example statement
example_statement_embedding = model.encode([example_statement])[0]

# Compute embeddings for each content value in the dataset
content_embeddings = model.encode(unwanted_news['content'].tolist())

# Calculate cosine similarity between the example statement embedding and each content embedding
similarities = cosine_similarity([example_statement_embedding], content_embeddings)[0]

# Add the similarity scores as a new column in the DataFrame
unwanted_news['similarity_score'] = similarities

In [None]:
# Assuming 'unwanted_news' is your DataFrame
similar_news = unwanted_news.loc[unwanted_news['similarity_score'] > 0.60]

similar_news.drop(columns = ['similarity_score','Unnamed: 0'], inplace = True, errors='ignore')

## EXTRACTING STAGE II FILTERED DATA

In [None]:
similar_news['id'] = similar_news['link'].apply(extract_id)
order = ['id','link','content','news_date']
filtered_dataset_2 = similar_news[order]

In [None]:
filtered_dataset = pd.concat([filtered_dataset_1, filtered_dataset_2], ignore_index=True)
filtered_dataset

In [None]:
filtered_dataset.to_csv('filtered_dataset.csv')

# DATA CLEANING

In [None]:
df = pd.read_csv('filtered_dataset.csv')

In [None]:
# Extracting the first line from the filtered news
One_Liners(df)

In [None]:
df.to_csv('one_liner.csv')

In [None]:
df.to_csv('dataframe_lat_long.csv')

In [13]:
import pandas as pd
states_data = pd.read_csv('dataframe_lat_long.csv')

In [18]:
place = list(states_data['place'])

In [20]:
def geocode(geocoder, config, query):
    cls = get_geocoder_for_service(geocoder)
    geolocator = cls(**config)
    location = geolocator.geocode(query)
    return location.address

In [21]:
from tqdm import tqdm

# Dictionary of Indian states
indian_states = {
    'Andaman and Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh',
    'Dadra and Nagar Haveli and Daman and Diu', 'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir',
    'Jharkhand', 'Karnataka', 'Kerala', 'Ladakh', 'Lakshadweep', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya',
    'Mizoram', 'Nagaland', 'Odisha', 'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
    'Uttar Pradesh', 'Uttarakhand', 'West Bengal'
}

states = []

for i in tqdm(range(len(place)), desc="Processing"):
    try:
        split_result = geocode("nominatim", dict(user_agent="aakarshsurendra"), place[i]).split(',')
        state = None
        for part in split_result:
            part = part.strip()
            if part in indian_states:
                state = part
                break
            elif part and not any(char.isdigit() or char.isalpha() for char in part):
                # Skip parts that are not alphanumeric (e.g., blank, unwanted characters)
                continue
        states.append(state)
    except:
        states.append(None)


Processing: 100%|██████████| 6750/6750 [40:02<00:00,  2.81it/s]   


In [22]:
states_data['state']=states

In [24]:
states_data.to_csv('dataframe_states.csv')

In [None]:
pip install openmeteo-requests

In [None]:
pip install requests-cache retry-requests numpy pandas

In [1]:
import pandas as pd
from datetime import datetime, timedelta
import openmeteo_requests
import requests_cache
from retry_requests import retry
from tqdm import tqdm

# Read the CSV file into a DataFrame
df = pd.read_csv('dataframe_states.csv')

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

def extract_date(date_str):
    date_str = date_str.strip()  # Remove leading and trailing whitespaces
    try:
        return datetime.strptime(date_str, 'Updated: %b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')
    except ValueError:
        return datetime.strptime(date_str, '%b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')

def get_week_avg_weather(lat, lon, start_date, end_date):
    if pd.isnull(lat):  # Skip rows with missing latitude
        return None
    
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": end_date,
        "hourly": "temperature_2m"
    }
    responses = openmeteo.weather_api("https://archive-api.open-meteo.com/v1/archive", params=params)
    response = responses[0]  # Assuming only one location is being queried
    hourly = response.Hourly()
    hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
    return pd.Series(hourly_temperature_2m).mean()

# Add a new column 'week_avg_weather' to the DataFrame
df['News_date'] = df['News_date'].apply(extract_date)

tqdm.pandas(desc="Calculating weather")
df['week_avg_weather'] = df.progress_apply(lambda row: get_week_avg_weather(row['latitude'], row['longitude'], row['News_date'], (datetime.strptime(row['News_date'], '%Y-%m-%d') + timedelta(days=7)).strftime('%Y-%m-%d')), axis=1)

print(df[['place', 'News_date', 'week_avg_weather']])

Calculating weather: 100%|██████████| 6750/6750 [01:36<00:00, 69.62it/s]  

                              place   News_date  week_avg_weather
0                         Sultanpur  2024-02-23         18.672459
1                            Jaipur  2024-02-19         18.577219
2                           Raichur  2024-02-18         29.642168
3                         New Delhi  2024-02-16         18.050730
4                         Hyderabad  2024-02-22         26.445761
...                             ...         ...               ...
6745                         Rajkot  2019-07-30         27.055136
6746                      Bengaluru  2019-05-28         25.609011
6747                Ambala/Parwanoo  2019-05-04               NaN
6748                        Madurai  2019-06-04         30.240919
6749  Padiyan Ka Purwa (Rae Bareli)  2018-05-10               NaN

[6750 rows x 3 columns]





In [3]:
df.to_csv('dataset_weather.csv')

### Precipitation

In [2]:
import pandas as pd
from datetime import datetime, timedelta
import openmeteo_requests
import requests_cache
from retry_requests import retry
from tqdm import tqdm

# Read the CSV file into a DataFrame
df = pd.read_csv('dataset_weather.csv')

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

def extract_date(date_str):
    date_str = date_str.strip()  # Remove leading and trailing whitespaces
    try:
        return datetime.strptime(date_str, 'Updated: %b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')
    except ValueError:
        return datetime.strptime(date_str, '%b %d, %Y, %H:%M IST').strftime('%Y-%m-%d')

def get_precipitation_3days(lat, lon, start_date):
    if pd.isnull(lat):  # Skip rows with missing latitude
        return None
    
    total_precipitation = 0
    for i in range(-1, 2):  # Loop for the current day and the two days before and after
        date = (datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=i)).strftime('%Y-%m-%d')
        params = {
            "latitude": lat,
            "longitude": lon,
            "start_date": date,
            "end_date": date,
            "hourly": "precipitation"
        }
        responses = openmeteo.weather_api("https://archive-api.open-meteo.com/v1/archive", params=params)
        response = responses[0]  # Assuming only one location is being queried
        hourly_precipitation = response.Hourly().Variables(0).ValuesAsNumpy()
        total_precipitation += hourly_precipitation.sum()
    
    return total_precipitation

# Add a new column 'precipitation_3days' to the DataFrame
df['News_date'] = df['News_date'].apply(extract_date)

tqdm.pandas()
df['precipitation_3days'] = df.progress_apply(lambda row: get_precipitation_3days(row['latitude'], row['longitude'], row['News_date']), axis=1)

print(df[['place', 'News_date', 'precipitation_3days']])

100%|██████████| 6750/6750 [07:56<00:00, 14.16it/s] 

                              place   News_date  precipitation_3days
0                         Sultanpur  2024-02-23             0.200000
1                            Jaipur  2024-02-19             0.000000
2                           Raichur  2024-02-18             0.000000
3                         New Delhi  2024-02-16             0.000000
4                         Hyderabad  2024-02-22             0.000000
...                             ...         ...                  ...
6745                         Rajkot  2019-07-30            73.299995
6746                      Bengaluru  2019-05-28             5.900000
6747                Ambala/Parwanoo  2019-05-04                  NaN
6748                        Madurai  2019-06-04            30.000000
6749  Padiyan Ka Purwa (Rae Bareli)  2018-05-10                  NaN

[6750 rows x 3 columns]





In [3]:
# Save the processed data to a CSV file
df.to_csv('precipitation_processed_data.csv', index=False)

### Combining Small Cities for Visualization