In [18]:
from pyspark.sql.functions import col, lit, expr, when, to_timestamp
from pyspark.sql.types import *
from datetime import datetime
import time
import pyspark
from pyspark.sql import SparkSession
import random
import string
import pandas as pd

In [19]:
##+===========================================================SET DATASETLENGTH HERE!!!
DataLength=1000000

In [20]:
#ORIGINAL DATASETS
#German Cities Data
german_cities_data = {
    "CityID": [1, 2, 3, 4, 5],
    "CityName": ["Berlin", "Hamburg", "Munich", "Cologne", "Frankfurt"],
    "Population": [3520000, 1790000, 1450000, 1060000, 733000],
    "Area": [891.3, 755.2, 310.7, 405.15, 500],
    "CulturalSites": [7, 6, 3, 5, 3]  
}

# French Cities Data
french_cities_data = {
    "CityID": [1, 2, 3, 4, 5],
    "CityName": ["Paris", "Marseille", "Lyon", "Toulouse", "Nice"],
    "Population": [2250000, 851000, 491000, 447000, 344000],
    "Area": [250, 240.6, 47.87, 118.3, 71.9],      
    "CulturalSites": [7, 7, 1, 4, 3]
}

In [21]:

# Function to generate random city name
def generate_city_name(length):
    letters = string.ascii_letters
    return ''.join(random.choice(letters) for i in range(length))

# Function to generate random data for cities
def generate_city_data(num_cities):
    cities_data = []
    for i in range(num_cities):
        city_name = generate_city_name(random.randint(4, 6))
        population = random.randint(350, 5000000)
        area = round(random.uniform(170, 5000), 2)
        tourist_visits = random.randint(3000000, 6000000)
        cultural_sites = random.randint(2, 7)
        cities_data.append({
            "CityID": i + 1,
            "CityName": city_name,
            "Population": population,
            "Area": area,
            "TouristVisits": tourist_visits,
            "CulturalSites": cultural_sites
        })
    return cities_data



In [22]:
# Augment German cities data
german_cities_augmented = german_cities_data.copy()
german_cities_augmented["CityID"].extend(range(6, DataLength+6))
german_cities_augmented["CityName"].extend([city_data["CityName"] for city_data in generate_city_data(DataLength)])
german_cities_augmented["Population"].extend([city_data["Population"] for city_data in generate_city_data(DataLength)])
german_cities_augmented["Area"].extend([city_data["Area"] for city_data in generate_city_data(DataLength)])
german_cities_augmented["CulturalSites"].extend([city_data["CulturalSites"] for city_data in generate_city_data(DataLength)])

# Augment French cities data
french_cities_augmented = french_cities_data.copy()
french_cities_augmented["CityID"].extend(range(6, DataLength+6))
french_cities_augmented["CityName"].extend([city_data["CityName"] for city_data in generate_city_data(DataLength)])
french_cities_augmented["Population"].extend([city_data["Population"] for city_data in generate_city_data(DataLength)])
french_cities_augmented["Area"].extend([city_data["Area"] for city_data in generate_city_data(DataLength)])
french_cities_augmented["CulturalSites"].extend([city_data["CulturalSites"] for city_data in generate_city_data(DataLength)])


In [24]:
# #MAKE DATAFRAMES# Create DataFrames
# df_german_cities = pd.DataFrame(german_cities_data)
# df_french_cities = pd.DataFrame(french_cities_data)

# # Save to CSV files
# df_german_cities.to_csv('german_cities.csv', index=False)
# df_french_cities.to_csv('french_cities.csv', index=False) 

In [25]:
frenchpath=r"C:\Users\alexw\Desktop\BDG3_FTP\Group_Assignment\french_cities.csv"
Germpath=r"C:\Users\alexw\Desktop\BDG3_FTP\Group_Assignment\german_cities.csv"

In [26]:
spark = SparkSession.builder \
    .appName("Assignment 1") \
    .getOrCreate()

In [27]:
F_schema = StructType([
    StructField("CityID", IntegerType(), nullable=False),
    StructField("CityName", StringType(), nullable=False),
    StructField("Population", IntegerType(), nullable=False),
    StructField("Area", DoubleType(), nullable=False),
    StructField("CulturalSites", IntegerType(), nullable=False)
])

french_cities = spark.read.format("csv").load(frenchpath, schema=F_schema)

In [28]:
G_schema = StructType([
    StructField("CityID", IntegerType(), nullable=False),
    StructField("CityName", StringType(), nullable=False),
    StructField("Population", IntegerType(), nullable=False),
    StructField("Area", DoubleType(), nullable=False),
    StructField("CulturalSites", IntegerType(), nullable=False)
])

german_cities = spark.read.format("csv").load(Germpath, schema=G_schema)

In [25]:

french_city_sizes = french_cities.select("CityName", "Area", "Population").show()

+---------+-------+----------+
| CityName|   Area|Population|
+---------+-------+----------+
| CityName|   null|      null|
|    Paris|  250.0|   2250000|
|Marseille|  240.6|    851000|
|     Lyon|  47.87|    491000|
| Toulouse|  118.3|    447000|
|     Nice|   71.9|    344000|
|   GCnYfE|4075.76|   1937534|
|     gmTA|3160.19|    550832|
|     tXCA|3468.78|   1190719|
|    AtawC| 319.06|    942912|
|     sjGV|1467.29|   4336033|
|    aqLRH|1423.17|     55182|
|   dzeZMc|1652.88|    903652|
|   RWIKss|4238.86|   1029760|
|    rqynV|3289.39|   1666656|
|    jiEHg|3176.19|   4697399|
|   wxOfMT|2726.65|   4307211|
|   bBUVtO| 3098.1|   2872468|
|    Qtzzi|3750.59|    174577|
|   XoDQVr|2859.59|   4894737|
+---------+-------+----------+
only showing top 20 rows



In [30]:
germ_city_sizes = german_cities.select("CityName", "Area", "Population").show()

+---------+-------+----------+
| CityName|   Area|Population|
+---------+-------+----------+
| CityName|   null|      null|
|   Berlin|  891.3|   3520000|
|  Hamburg|  755.2|   1790000|
|   Munich|  310.7|   1450000|
|  Cologne| 405.15|   1060000|
|Frankfurt|  500.0|    733000|
|     tLTT|1684.85|   4612679|
|     posU|4686.78|   3629196|
|   Nwvngc| 361.22|   2356514|
|     lznY|4321.89|   1316886|
|     isVB|2166.49|   3141122|
|    uDyYV|4990.76|   2318253|
|     vxDu|1144.51|   1286493|
|    gYbWQ|2191.29|    914876|
|   jDlwYd| 3641.3|   3949795|
|     raYl|4461.73|     33738|
|   uJVunV| 519.79|    629804|
|   qHIceI| 4914.2|   4200206|
|   BzhvJZ|4395.65|   3218325|
|    coBNe|2462.85|   2977006|
+---------+-------+----------+
only showing top 20 rows



In [11]:
#ALEX Q6
from pyspark.sql.functions import avg
german_avg = german_cities.select(
    lit("German").alias("Country"),
    avg("CityID").alias("Avg_CityID"),
    avg("Population").alias("Avg_Population"),
    avg("Area").alias("Avg_Area"),
    avg("CulturalSites").alias("Avg_CulturalSites")
)

french_avg = french_cities.select(
    lit("French").alias("Country"),
    avg("CityID").alias("Avg_CityID"),
    avg("Population").alias("Avg_Population"),
    avg("Area").alias("Avg_Area"),
    avg("CulturalSites").alias("Avg_CulturalSites")
)

# Combine the averages into a single dataframe
combined_avg = german_avg.union(french_avg)

# Show the combined averages
combined_avg.show()

+-------+----------+------------------+------------------+------------------+
|Country|Avg_CityID|    Avg_Population|          Avg_Area| Avg_CulturalSites|
+-------+----------+------------------+------------------+------------------+
| German|    1003.0|2471025.1905236905|2565.4415012468817| 4.436907730673317|
| French|    1003.0|2474832.3685785537|2593.9251970074815|4.5042394014962595|
+-------+----------+------------------+------------------+------------------+

