In [1]:
#Q1:
import pandas as pd
import numpy as np

# Load the CSV file
file_path = 'eateries.csv'
data = pd.read_csv(file_path)

# Extract latitude and longitude for the given business IDs
def get_coordinates(data, business_id):
    record = data.loc[data['business_id'] == business_id]
    if record.empty:
        raise ValueError(f"Business ID {business_id} not found in the dataset.")
    return record.iloc[0]['latitude'], record.iloc[0]['longitude']

# Haversine formula
def haversine(lat1, lon1, lat2, lon2):
    R = 6371000  # Radius of Earth in meters
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)

    a = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c

# Coordinates for Darwin Cafe and Celtic Catering
business_id_1 = 39528  # Darwin Cafe
business_id_2 = 66921  # Celtic Catering

lat1, lon1 = get_coordinates(data, business_id_1)
lat2, lon2 = get_coordinates(data, business_id_2)

# Calculate the distance
distance = haversine(lat1, lon1, lat2, lon2)
print(f"The distance between Darwin Cafe and Celtic Catering is {distance:.2f} meters.")

The distance between Darwin Cafe and Celtic Catering is 540.92 meters.


In [2]:
#Q2:
import pandas as pd
import numpy as np
from itertools import combinations

# Load the dataset
file_path = 'eateries.csv'
data = pd.read_csv(file_path)

# Haversine formula
def haversine(lat1, lon1, lat2, lon2):
    R = 6371000  # Earth's radius in meters
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)

    a = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c

# Extract relevant columns
coordinates = data[['business_id', 'latitude', 'longitude']].dropna()

# Generate all pairs of eateries
pairs = combinations(coordinates.index, 2)

# Count pairs within 967 meters
threshold_distance = 967
count = 0

for i, j in pairs:
    lat1, lon1 = coordinates.loc[i, ['latitude', 'longitude']]
    lat2, lon2 = coordinates.loc[j, ['latitude', 'longitude']]

    distance = haversine(lat1, lon1, lat2, lon2)
    if distance <= threshold_distance:
        count += 1

print(f"The number of pairs of eateries within 967 meters of each other is: {count}")


KeyboardInterrupt: 

In [20]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree

# Load the dataset
file_path = 'eateries.csv'
data = pd.read_csv(file_path)

# Extract relevant columns
coordinates = data[['business_id', 'latitude', 'longitude']].dropna()

# Prepare the data for BallTree (converting lat, lon to radians)
coords = coordinates[['latitude', 'longitude']].values

# Create the BallTree
tree = BallTree(np.radians(coords), metric='haversine')

# Query the tree to find pairs within 874 meters (converted to radians)
threshold_distance = 874 / 6371000  # Convert meters to radians (Earth's radius)
pairs_within_threshold = tree.query_radius(np.radians(coords), r=threshold_distance)

# Count the unique pairs (avoid duplicates and self-pairing)
count = 0
for i, neighbors in enumerate(pairs_within_threshold):
    for j in neighbors:
        if i < j:  # Ensure pairs (i, j) are counted once
            count += 1

print(f"The number of pairs of eateries within 874 meters of each other is: {count}")

The number of pairs of eateries within 874 meters of each other is: 432606


In [14]:
import geopandas as gpd

# Load the shapefile
shapefile_path = 'DISTRICT_BOUNDARY.shp'
gdf = gpd.read_file(shapefile_path)

# View the columns
print("Columns in the shapefile:")
print(gdf.columns)

# Optionally, you can view the first few rows to inspect the data
print("\nFirst few rows of the shapefile:")
print(gdf.head())


Columns in the shapefile:
Index(['geometry'], dtype='object')

First few rows of the shapefile:
                                            geometry
0  POLYGON ((71.27029 23.15576, 71.26987 23.13801...
1  POLYGON ((71.99842 23.49875, 72.01039 23.50096...
2  POLYGON ((73.21971 22.62603, 73.21882 22.62125...
3  MULTIPOLYGON (((69.33941 22.38597, 69.33982 22...
4  MULTIPOLYGON (((70.5365 22.40753, 70.53275 22....


In [16]:
import geopandas as gpd
from shapely.geometry import Point

# Load the shapefile
shapefile_path = "DISTRICT_BOUNDARY.shp"  # Adjust the path as needed
district_boundaries = gpd.read_file(shapefile_path)

# Points to test
points = [
    Point(82.4384, 25.9508),
    Point(83.149, 25.0315),
    Point(83.0155, 25.5037),
    Point(82.4358, 25.1944),
]

# Filter for the district "JAUNPUR" and state "UTTAR PRADESH"
district_jaunpur = district_boundaries[
    (district_boundaries["District"].str.contains("JAUNPUR", case=False, na=False)) &
    (district_boundaries["STATE"].str.contains("UTTAR PRADESH", case=False, na=False))
]

# Check if the points fall inside the filtered district
results = {}
for point in points:
    point_label = f"Point({point.x}, {point.y})"
    results[point_label] = district_jaunpur.contains(point).any()

# Print results
for point, is_inside in results.items():
    print(f"{point}: {'Inside' if is_inside else 'Outside'}")


KeyError: 'District'

In [None]:
#Q5:
import geopandas as gpd
from shapely.geometry import box

# Load the shapefile
shapefile_path = 'DISTRICT_BOUNDARY.shp'
districts = gpd.read_file(shapefile_path)

# Normalize district and state names
districts['DISTRICT'] = districts['DISTRICT'].str.strip().str.upper()
districts['STATE'] = districts['STATE'].str.strip().str.upper()

# Filter for the target district
target_district = "GARIY>BAND"
target_state = "CHHATT|SGARH"
target_district_geom = districts[
    (districts['DISTRICT'].str.contains(target_district, na=False)) &
    (districts['STATE'].str.contains(target_state, na=False))
]

if target_district_geom.empty:
    raise ValueError("The specified district and state were not found in the shapefile.")

# Extract the geometry of the target district
target_geometry = target_district_geom.unary_union

# Define the bounding box (Longitude: 82.2048 to 82.3861, Latitude: 20.2427 to 20.9378)
bounding_box = box(82.2048, 20.2427, 82.3861, 20.9378)

# Calculate the intersection area
intersection = target_geometry.intersection(bounding_box)

# Calculate areas
district_area = target_geometry.area  # District's total area
intersection_area = intersection.area  # Intersection area

# Calculate the percentage
percentage = (intersection_area / district_area) * 100

print(f"The percentage of the district's area intersected by the bounding box is {percentage:.2f}%.")

In [6]:
#Q6:
import json
from collections import defaultdict

# Load the JSON file
file_path = 'discourse-topics.json'
with open(file_path, 'r') as file:
    data = json.load(file)

# Dictionary to track the number of topics each user has posted in
user_topics = defaultdict(set)

# Iterate through topics and collect usernames for each topic
for topic in data:
    posters = topic.get("posters", [])
    for poster in posters:
        username = poster.get("username")
        if username:
            user_topics[username].add(topic["topicId"])

# Calculate the number of topics for AnupamKumarJha
anupam_topics_count = len(user_topics["AnupamKumarJha"])

# Count how many users have posted in more topics than AnupamKumarJha
users_with_more_topics = sum(
    1 for user, topics in user_topics.items() if len(topics) > anupam_topics_count
)

print(f"{users_with_more_topics} users posted in more topics than AnupamKumarJha.")

288 users posted in more topics than AnupamKumarJha.


In [7]:
#Q7:
import json
from itertools import combinations
from collections import defaultdict

# Load the JSON file
file_path = 'discourse-topics.json'
with open(file_path, 'r') as file:
    data = json.load(file)

# Dictionary to count collaborations between pairs of users
collaborations = defaultdict(int)

# Iterate through each topic to find user pairs
for topic in data:
    posters = topic.get("posters", [])
    usernames = [poster.get("username") for poster in posters if "username" in poster]

    # Generate all unique pairs of users for this topic
    for pair in combinations(sorted(usernames), 2):
        collaborations[pair] += 1

# Find the number of pairs with collaborations > 28
threshold = 28
pairs_with_more_collaborations = sum(1 for pair, count in collaborations.items() if count > threshold)

print(f"{pairs_with_more_collaborations} pairs of users collaborated on more topics than 23f1000966 and 22f3002293.")

17 pairs of users collaborated on more topics than 23f1000966 and 22f3002293.


In [8]:
#Q8:
import json
from collections import defaultdict, deque

# Load the JSON file
file_path = 'discourse-topics.json'
with open(file_path, 'r') as file:
    data = json.load(file)

# Step 1: Build the graph
graph = defaultdict(set)

for topic in data:
    posters = topic.get("posters", [])
    usernames = [poster.get("username") for poster in posters if "username" in poster]
    for i, user1 in enumerate(usernames):
        for user2 in usernames[i+1:]:
            graph[user1].add(user2)
            graph[user2].add(user1)

# Step 2: BFS to find the shortest path
def bfs_shortest_path(graph, start, end):
    queue = deque([[start]])  # Queue stores paths
    visited = set()

    while queue:
        path = queue.popleft()
        node = path[-1]

        if node == end:
            return path

        if node not in visited:
            visited.add(node)
            for neighbor in graph[node]:
                new_path = list(path)
                new_path.append(neighbor)
                queue.append(new_path)

    return None  # No path found

# Find the shortest path between gokulakrishnan and 22f3002345
shortest_path = bfs_shortest_path(graph, "gokulakrishnan", "22f3002345")

# Step 3: Check if the specified users are in the path
specified_users = {"Soni", "santhanakrishnan", "Milo", "21f3002833"}
users_in_path = set(shortest_path) & specified_users if shortest_path else set()

# Print results
if users_in_path:
    print(f"The following users are in the shortest path: {', '.join(users_in_path)}")
else:
    print("None of the specified users are in the shortest path.")

None of the specified users are in the shortest path.


In [9]:
#Q9:
import json
from collections import defaultdict, deque

# Load the JSON file
file_path = 'discourse-topics.json'
with open(file_path, 'r') as file:
    data = json.load(file)

# Step 1: Build the graph
graph = defaultdict(set)

for topic in data:
    posters = topic.get("posters", [])
    usernames = [poster.get("username") for poster in posters if "username" in poster]
    for i, user1 in enumerate(usernames):
        for user2 in usernames[i+1:]:
            graph[user1].add(user2)
            graph[user2].add(user1)

# Step 2: Modified BFS to count shortest paths
def bfs_count_shortest_paths(graph, start, end):
    queue = deque([start])
    visited = set()
    shortest_paths = {start: 1}  # Number of shortest paths to each node
    distances = {start: 0}  # Shortest distance to each node

    while queue:
        node = queue.popleft()

        # Stop early if we reach the end
        if node == end:
            continue

        for neighbor in graph[node]:
            if neighbor not in distances:
                # First time visiting this node, set its distance and paths
                distances[neighbor] = distances[node] + 1
                shortest_paths[neighbor] = shortest_paths[node]
                queue.append(neighbor)
            elif distances[neighbor] == distances[node] + 1:
                # Found another shortest path to this node
                shortest_paths[neighbor] += shortest_paths[node]

    return shortest_paths.get(end, 0)

# Step 3: Find the number of shortest paths between 21f2000705 and Yoha
start_user = "21f2000705"
end_user = "Yoha"
number_of_paths = bfs_count_shortest_paths(graph, start_user, end_user)

print(f"The number of shortest paths between {start_user} and {end_user} is {number_of_paths}.")

The number of shortest paths between 21f2000705 and Yoha is 6.


In [12]:
#Q10:
import json
from collections import defaultdict, deque

# Load the JSON file
file_path = 'discourse-topics.json'
with open(file_path, 'r') as file:
    data = json.load(file)

# Step 1: Build the graph
graph = defaultdict(set)

for topic in data:
    posters = topic.get("posters", [])
    usernames = [poster.get("username") for poster in posters if "username" in poster]
    for i, user1 in enumerate(usernames):
        for user2 in usernames[i+1:]:
            graph[user1].add(user2)
            graph[user2].add(user1)

# Step 2: Modified BFS to count shortest paths
def bfs_count_shortest_paths(graph, start, end):
    queue = deque([start])
    visited = set()
    shortest_paths = {start: 1}  # Number of shortest paths to each node
    distances = {start: 0}  # Shortest distance to each node

    while queue:
        node = queue.popleft()

        # Stop early if we reach the end
        if node == end:
            continue

        for neighbor in graph[node]:
            if neighbor not in distances:
                # First time visiting this node, set its distance and paths
                distances[neighbor] = distances[node] + 1
                shortest_paths[neighbor] = shortest_paths[node]
                queue.append(neighbor)
            elif distances[neighbor] == distances[node] + 1:
                # Found another shortest path to this node
                shortest_paths[neighbor] += shortest_paths[node]

    return shortest_paths.get(end, 0)

# Step 3: Find the number of shortest paths between 22f3001699 and 23f2000385
start_user = "22f3001699"
end_user = "23f2000385"
number_of_paths = bfs_count_shortest_paths(graph, start_user, end_user)

print(f"The number of shortest paths between {start_user} and {end_user} is {number_of_paths}.")

The number of shortest paths between 22f3001699 and 23f2000385 is 30.
