# Geocoding

## Question 1

In [11]:
import json

def _split_place_name(full_name):
    """Split a place ``full_name`` into ``(city, country)``.

    The first comma-separated segment is taken as the city and the last one
    as the country, so names with more than two segments
    (e.g. ``"Manhattan, New York, USA"``) no longer break the parsing.
    A name without any comma yields ``(name, '')``: it is counted as a city
    only, because nothing in the string identifies a separate country.
    """
    parts = [part.strip() for part in full_name.split(',')]
    city = parts[0]
    country = parts[-1] if len(parts) > 1 else ''
    return city, country


def process_tweets(json_file_path):
    """Print how many tweets were posted from each city and each country.

    Parameters
    ----------
    json_file_path : str or path-like
        UTF-8 encoded JSON file. The code iterates over the parsed document
        and calls ``.get('place')`` on every item, so it assumes the root is
        a list of tweet dicts (TODO confirm against the data export).

    Returns
    -------
    None
        The two rankings are written to stdout, most frequent first; ties
        keep the order in which the names were first seen.

    Notes
    -----
    Tweets with a missing/empty ``place`` or ``place['full_name']`` are
    skipped. City and country are derived from ``full_name`` by
    ``_split_place_name``.
    """
    # Read JSON file
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Tweet counters keyed by city name and by country name
    city_count = {}
    country_count = {}

    for tweet in data:
        # 'place' may be absent or explicitly null; treat both as "no place"
        place = tweet.get('place') or {}
        full_name = place.get('full_name')
        if not full_name:
            continue

        city, country = _split_place_name(full_name)

        # Empty segments (e.g. from a trailing comma) are not counted
        if city:
            city_count[city] = city_count.get(city, 0) + 1
        if country:
            country_count[country] = country_count.get(country, 0) + 1

    # Print results in descending order of tweet count
    print("City/Number of tweets:")
    for city, count in sorted(city_count.items(), key=lambda x: x[1], reverse=True):
        print(f"{city}: {count} tweets")

    print("\nCountry/Number of tweets:")
    for country, count in sorted(country_count.items(), key=lambda x: x[1], reverse=True):
        print(f"{country}: {count} tweets")


# Tweet export to analyse; the path is relative to the notebook's working directory.
json_file_path = "./Data/FIFAWorldCup2022.json"
# Prints the per-city and per-country tweet rankings to the cell output (returns nothing).
process_tweets(json_file_path)

City/Number of tweets:
Georgia: 2 tweets
Doha: 2 tweets
Dubai: 1 tweets
Muscat: 1 tweets
Mollet del Vallès: 1 tweets
Zouagha: 1 tweets
Stockholm: 1 tweets
Al Rayyan: 1 tweets

Country/Number of tweets:
Qatar: 3 tweets
USA: 2 tweets
United Arab Emirates: 1 tweets
Oman: 1 tweets
España: 1 tweets
Royaume du Maroc: 1 tweets
Sweden: 1 tweets


## Question 2 
### PART A

In [20]:
import pandas as pd
from datetime import datetime

def load_data(json_file_path):
    """Parse the UTF-8 encoded JSON file at ``json_file_path`` and return its content.

    The return value is whatever ``json.load`` produces for the document
    (list, dict, ...); the file is closed before the function returns.
    """
    with open(json_file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

def create_dataframe(data, follower_or_followee):
    """Build a profile table from user records and label every row.

    Parameters
    ----------
    data : anything accepted by ``pd.DataFrame``
        User records; each must provide ``created_at`` (a string such as
        ``"Wed Oct 10 20:19:24 +0000 2018"``) plus the columns selected below.
    follower_or_followee : str
        Label written to the ``follower_or_followee`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Only the twelve selected columns, in the order listed in
        ``selected_columns``; ``creation_time`` holds ``created_at`` parsed
        as datetimes.
    """
    selected_columns = [
        'id', 'name', 'screen_name', 'follower_or_followee', 'location', 'description',
        'followers_count', 'friends_count', 'favourites_count', 'creation_time',
        'statuses_count', 'verified',
    ]
    labelled = pd.DataFrame(data).assign(
        follower_or_followee=follower_or_followee,
        # The "+0000" offset is matched literally, so the parsed timestamps are naive
        creation_time=lambda frame: pd.to_datetime(
            frame['created_at'], format='%a %b %d %H:%M:%S +0000 %Y'
        ),
    )
    return labelled[selected_columns]

# Input locations (relative to the notebook's working directory)
followers_file_path = "./Data/followers.json"
followees_file_path = "./Data/followees.json"

# Parse the raw JSON exports
followers_data = load_data(json_file_path=followers_file_path)
followees_data = load_data(json_file_path=followees_file_path)

# One labelled DataFrame per relationship type
followers_df = create_dataframe(data=followers_data, follower_or_followee='follower')
followees_df = create_dataframe(data=followees_data, follower_or_followee='followee')

# Stack followers on top of followees and renumber the rows 0..n-1
result_df = pd.concat(objs=[followers_df, followees_df], ignore_index=True)

# Show the combined table under its heading
print("Part A", result_df, sep="\n")



Part A
                      id               name     screen_name  \
0    1236629541327495169     Tasnim Sultana  tasnimmuna2610   
1    1602456658344542208      sajad heidary   sajadheidary4   
2             1798659276       Junyuan Hong          hjy836   
3              102933899           Jennifer     EstiZhafira   
4    1265285662489468928              scoji          scoji3   
..                   ...                ...             ...   
210              8143682      Jure Leskovec            jure   
211             62044012  Michael Bronstein     mmbronstein   
212   850892377627742209         Tyler Derr   TylersNetwork   
213  1190461842922995712          CIKM 2021        CIKM2021   
214             13334762             GitHub          github   

    follower_or_followee                    location  \
0               follower                  Bangladesh   
1               follower                       Eywan   
2               follower                         USA   
3           

## Question 2
### PART B

In [21]:
# Part B

# Mean followers_count of the accounts in each group, rounded to 2 decimals
average_followers_count_followers, average_followers_count_followees = (
    round(frame['followers_count'].mean(), 2) for frame in (followers_df, followees_df)
)
# Mean friends_count (accounts followed) of the accounts in each group
average_followees_count_followers, average_followees_count_followees = (
    round(frame['friends_count'].mean(), 2) for frame in (followers_df, followees_df)
)

# Row counts where 'verified' is True: followers, followees, and both together
num_verified_followers, num_verified_followees, num_verified_combined = (
    frame.loc[frame['verified'] == True].shape[0]
    for frame in (followers_df, followees_df, result_df)
)

# Mean favourites_count per group and overall
(average_favorites_count_followers,
 average_favorites_count_followees,
 average_favorites_count_combined) = (
    round(frame['favourites_count'].mean(), 2)
    for frame in (followers_df, followees_df, result_df)
)

# Mean statuses_count (number of tweets) per group and overall
(average_statuses_count_followers,
 average_statuses_count_followees,
 average_statuses_count_combined) = (
    round(frame['statuses_count'].mean(), 2)
    for frame in (followers_df, followees_df, result_df)
)

# Account-creation year, then the number of accounts per year for each group
result_df['creation_year'] = result_df['creation_time'].dt.year
followers_per_year, followees_per_year = (
    result_df.loc[result_df['follower_or_followee'] == group].groupby('creation_year').size()
    for group in ('follower', 'followee')
)

# Report the requested metrics (I-IX)
print("I) Average followers count of your followers:", average_followers_count_followers)
print("II) Average followers count of your followees:", average_followers_count_followees)
print("III) Average followees count of your followers:", average_followees_count_followers)
print("IV) Average followees count of your followees:", average_followees_count_followees)
print("V) Number of your verified followers:", num_verified_followers)
print("VI) Average favorites count of your followers:", average_favorites_count_followers)
print("VII) Average number of tweets of your followers:", average_statuses_count_followers)
print("VIII) Number of your followers per year:")
print(followers_per_year)
print("IX) Number of your followees per year:")
print(followees_per_year)

I) Average followers count of your followers: 2983.21
II) Average followers count of your followees: 1075218.86
III) Average followees count of your followers: 1055.49
IV) Average followees count of your followees: 1019.7
V) Number of your verified followers: 1
VI) Average favorites count of your followers: 1899.68
VII) Average number of tweets of your followers: 654.21
VIII) Number of your followers per year:
creation_year
2008     1
2009     3
2010     4
2011     5
2012     4
2013     3
2014     4
2015     3
2016     7
2017     4
2018     6
2019     9
2020    13
2021    13
2022    12
dtype: int64
IX) Number of your followees per year:
creation_year
2007     4
2008     5
2009    16
2010     8
2011     9
2012     9
2013     6
2014     6
2015     8
2016     8
2017     8
2018     3
2019    10
2020    10
2021     8
2022     6
dtype: int64


## Question 4

In [51]:
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import http.client
import traceback

def _parse_projects(html):
    """Extract ``(title, description, areas)`` tuples from the projects page HTML.

    Each ``<div class="project-container">`` yields one tuple: the text of its
    first ``<h3>``, the text of its first ``<p>``, and the texts of all its
    ``<li>`` items. A missing ``<h3>`` or ``<p>`` becomes an empty string so
    one incomplete container does not discard every other project.
    """
    soup = BeautifulSoup(html, 'html.parser')

    projects = []
    for container in soup.find_all('div', class_='project-container'):
        title_tag = container.find('h3')
        description_tag = container.find('p')

        title = title_tag.text.strip() if title_tag is not None else ""
        description = description_tag.text.strip() if description_tag is not None else ""
        areas_list = [area.text.strip() for area in container.find_all('li')]

        projects.append((title, description, areas_list))
    return projects


def scrape_projects(url, timeout=30):
    """Download a projects page and return the projects listed on it.

    Parameters
    ----------
    url : str
        Absolute ``http://`` or ``https://`` URL of the page.
    timeout : float, optional
        Socket timeout in seconds for the connection (default 30), so an
        unresponsive server cannot block the notebook indefinitely.

    Returns
    -------
    list of (str, str, list of str) or None
        One ``(title, description, areas)`` tuple per project container
        (possibly an empty list), or ``None`` when the URL is invalid, the
        server answers with a status other than 200, or any error occurs.
        Failures are reported on stdout; exceptions also print a traceback.
    """
    parsed_url = urlparse(url)
    scheme = parsed_url.scheme.lower()
    host = parsed_url.netloc

    # Reject malformed input up front instead of attempting a connection
    if scheme not in ("http", "https") or not host:
        print(f"Invalid URL: {url!r} (expected an absolute http(s) URL)")
        return None

    # Request target: never empty, and keep the query string if there is one
    target = parsed_url.path or "/"
    if parsed_url.query:
        target = f"{target}?{parsed_url.query}"

    # Honour the URL scheme. HTTPSConnection verifies the server certificate
    # with the default SSL context; a CERTIFICATE_VERIFY_FAILED error points
    # at the local CA certificate store and should be fixed there rather than
    # by disabling verification.
    connection_class = (
        http.client.HTTPSConnection if scheme == "https" else http.client.HTTPConnection
    )

    try:
        connection = connection_class(host, timeout=timeout)
        try:
            connection.request("GET", target)
            response = connection.getresponse()

            # Only a plain 200 counts as success (redirects are not followed)
            if response.status != 200:
                print(f"Failed to retrieve content. Status code: {response.status}")
                return None

            # Prefer the charset declared by the server, falling back to UTF-8
            charset = response.headers.get_content_charset() or "utf-8"
            content = response.read().decode(charset)
        finally:
            # Always release the socket, on success and on failure
            connection.close()

        return _parse_projects(content)
    except Exception as e:
        # Best-effort scraper: report the problem and let the caller see None
        print(f"An error occurred: {e}")
        traceback.print_exc()  # Print the full traceback for debugging
        return None

# URL of the research projects page
url = "https://cs.usu.edu/people/HamidKarimi/projects.html"

# Scrape projects: None means the request failed, a list (possibly empty) means it succeeded
result = scrape_projects(url)

# Display the result
if result is None:
    print("Scraping failed.")
elif not result:
    # The page was fetched but contained no project containers; that is not a failure
    print("No projects found.")
else:
    for i, (title, description, areas_list) in enumerate(result, start=1):
        print(f"Project {i}:")
        print(f"Title: {title}")
        print(f"Description: {description}")
        print(f"Areas: {areas_list}")
        print("\n")

An error occurred: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:992)
Scraping failed.


Traceback (most recent call last):
  File "/var/folders/9q/yldjq0js2ng8cysrh56tt9xw0000gn/T/ipykernel_66556/2280067102.py", line 17, in scrape_projects
    connection.request("GET", path)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/http/client.py", line 1282, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/http/client.py", line 1328, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/http/client.py", line 1277, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/http/client.py", line 1037, in _send_output
    self.send(msg)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/http/client.py", line 975, in send
    self.connect()
  File "