In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from matplotlib import cm
from matplotlib.colors import Normalize
import matplotlib.pyplot as plt
import geopandas as gpd
import numpy as np
from matplotlib.colors import Normalize
import matplotlib.patches as mpatches

In [None]:
plt.rcParams['figure.dpi'] = 200  # resolution (DPI) used for figures rendered in the notebook

In [None]:
# Cities that were actually visited on the tour. Later cells read the columns
# "City", "Country", "Population", "Latitude", "Longitude", "Attendance" and
# "Number of concerts" from this frame.
where_she_was = pd.read_csv("processed/tour.csv")

In [None]:
# Rank curve of the visited cities' populations (ascending order).
# NOTE(review): the y label says "[mln]" but the raw "Population" column is
# plotted without rescaling - confirm the units of that column.
visited_populations = sorted(where_she_was["Population"].tolist())

population_ax = plt.gca()
population_ax.plot(visited_populations)
population_ax.set(
    xlabel='Miasta według populacji',
    ylabel='Populacja [mln]',
    title='Rozkład populacji miast',
)

plt.savefig('results/Rozkład populacji miast.png', dpi=300)
plt.show()

In [None]:
# World basemap with every visited city drawn on top of it.
world = gpd.read_file("original/110m_cultural.zip")
visited_map_ax = world.plot(figsize=(10, 6))

# Coordinates as plain arrays; marker area grows with city population.
a, b = (np.array(where_she_was[axis_col]) for axis_col in ('Longitude', 'Latitude'))
sizes = where_she_was['Population'] / 50000

visited_map_ax.scatter(a, b, s=sizes, c="yellow", alpha=0.7, edgecolors='w')
visited_map_ax.axis('off')
visited_map_ax.set_title('Mapa świata z zaznaczonymi faktycznymi miastami')

plt.savefig('results/Mapa świata z zaznaczonymi faktycznymi miastami.png', dpi=300)

# Disabled sketch: label the 10 most populous cities.
# NOTE(review): `adjust_text` is not imported in the visible import cell.
# top_10_indices = where_she_was['Population'].nlargest(10).index
# texts = []
# for i in top_10_indices:
#     texts.append(plt.text(a[i], b[i], where_she_was['City'][i], fontsize=8))
# adjust_text(texts, arrowprops=dict(arrowstyle='->', color='black'))

plt.show()

In [None]:
# Candidate cities, reduced to the columns used downstream and renamed so the
# column names line up with `where_she_was`.
cities = (
    pd.read_csv("clean/cities.csv")[["Country name EN", "Name", "Population", "Latitude", "Longitude"]]
    .rename(columns={"Country name EN": "Country", "Name": "City"})
)

# Candidates get zero concerts and the mean attendance of the real tour stops.
cities["Number of concerts"] = 0
cities["Attendance"] = round(where_she_was["Attendance"].mean())

# NOTE(review): the match is on city name only, so a same-named city in another
# country is also treated as visited.
visited_mask = cities["City"].isin(where_she_was["City"])
where_she_was_not = cities[~visited_mask]

# Population-weighted draw of 200 unvisited cities; the seed is fixed for reproducibility.
where_she_was_not_sampled = where_she_was_not.sample(
    n=200,
    weights=where_she_was_not["Population"],
    random_state=42069,
)
where_she_was_not_sampled

In [None]:
# World basemap with the randomly sampled (unvisited) cities drawn on top of it.
world = gpd.read_file("original/110m_cultural.zip")
sampled_map_ax = world.plot(figsize=(10, 6))

# Coordinates as plain arrays; marker area grows with city population.
a, b = (np.array(where_she_was_not_sampled[axis_col]) for axis_col in ('Longitude', 'Latitude'))
sizes = where_she_was_not_sampled['Population'] / 50000

sampled_map_ax.scatter(a, b, s=sizes, c="yellow", alpha=0.7, edgecolors='w')
sampled_map_ax.axis('off')
sampled_map_ax.set_title('Mapa świata z zaznaczonymi wylosowanymi miastami')

plt.savefig('results/Mapa świata z zaznaczonymi wylosowanymi miastami.png', dpi=300)

plt.show()

In [None]:
# Sanity check of the sampling: compare the sorted population curve of the
# visited cities (top panel) with that of the sampled unvisited cities (bottom
# panel). The sample is considered adequate when the curves have a similar
# shape and the populations are of the same order of magnitude.

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 6), sharex=False)

ax1.plot(sorted(where_she_was["Population"].tolist()))
ax2.plot(sorted(where_she_was_not_sampled["Population"].tolist()))

# Shared axis labels: x label on the bottom panel, one y label for the figure.
# Fixed label typo ("Miasta w według" -> "Miasta według") to match the
# single-panel population plot earlier in the notebook.
# NOTE(review): the y label says "[mln]" but the raw "Population" column is
# plotted without rescaling - confirm the units of that column.
ax2.set_xlabel('Miasta według populacji')
fig.text(0.04, 0.5, 'Populacja [mln]', va='center', rotation='vertical')

# Title above both panels
plt.suptitle('Porównanie rozkładu populacji miast oryginalnych i próbki losowej', y=0.95)

plt.savefig('results/Porównanie rozkładu populacji miast oryginalnych i próbki losowej.png', dpi=300)


plt.show()

In [None]:

# Modelling table: visited cities stacked on top of the sampled unvisited ones,
# re-indexed 0..n-1 and reduced to the feature and target columns.
model_columns = ["Latitude", "Longitude", "Country", "Population", "Attendance", "Number of concerts"]
stacked_cities = pd.concat([where_she_was, where_she_was_not_sampled], axis=0)
joined_dataset = stacked_cities.reset_index(drop=True)[model_columns]

joined_dataset

In [None]:
# Split the modelling table into the feature matrix and the regression target.
categorical_cols = ["Country"]
feature_cols = ["Latitude", "Longitude", "Country", "Population", "Attendance"]

X = joined_dataset[feature_cols]
y = joined_dataset["Number of concerts"]

# Plain-text dump of both objects, each preceded by its heading.
print("Features", X, "Target - number of concerts", y, sep="\n")

In [None]:
# Every known city: the visited ones followed by all unvisited candidates
# (not only the 200 sampled ones), with a fresh 0..n-1 index.
all_data = pd.concat([where_she_was, where_she_was_not], ignore_index=True)
all_data

In [None]:
# One-hot encode the categorical column(s); all remaining columns are passed
# through unchanged. Categories unseen at fit time are encoded as all zeros.
encoder = OneHotEncoder(handle_unknown='ignore')
ct = ColumnTransformer([('encoder', encoder, categorical_cols)], remainder='passthrough')

# The encoder is fitted on every known city (not just the modelling table) so
# that any country present in `all_data` gets its own column when other cities
# are scored later with `ct.transform`.
all_data_X = all_data[["Latitude", "Longitude", "Country", "Population", "Attendance"]]
ct.fit(all_data_X)
X_encoded = ct.transform(X)

# Row positions are split together with X and y so the test rows can be looked
# up in `joined_dataset` afterwards.
indices = np.arange(X.shape[0])

X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X_encoded, y, indices, test_size=0.2, random_state=8)

model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Clip predictions to be non-negative (a negative number of concerts is meaningless)
predictions = np.maximum(predictions, 0)

# `indices_test` holds positions, so select with .iloc; take an explicit copy so
# that adding the prediction column below does not raise SettingWithCopyWarning.
df_test = joined_dataset.iloc[indices_test].copy()
df_test['Number of concerts (predicted)'] = predictions

In [None]:
# Score the held-out predictions.
mse = mean_squared_error(y_test, predictions)

print("MEAN SQUARED ERROR (number of days)", mse)

# Actual vs. predicted concert days for the test cities, against city population.
# NOTE(review): the x label says "[mln]" but the raw "Population" column is
# plotted without rescaling - confirm the units of that column.
x_series = df_test["Population"]

evaluation_ax = plt.gca()
evaluation_ax.scatter(x_series, y_test, color='blue', label='Dane faktyczne')
evaluation_ax.scatter(x_series, predictions, color='red', label='Predykcje')

# Log-scaled population axis, axis labels and title in one call.
evaluation_ax.set(
    xscale='log',
    xlabel='Miasta według populacji [mln]',
    ylabel='Liczba dni koncertowych',
    title='Porównanie danych faktycznych z przewidywaniami',
)
evaluation_ax.legend()

plt.savefig('results/Porównanie danych faktycznych i predykcji.png', dpi=300)

plt.show()

In [None]:
# NOTE(review): this entire cell is disabled (commented-out) code kept for reference.
# Hazards if it is ever re-enabled:
#   - it rebinds `model` to a one-feature regression on log(population), which would
#     break the later cell that calls `model.predict(new_cities_encoded)`;
#   - it fits and predicts on the same test rows (`x_series`, `y_test`);
#   - `ticker` (matplotlib.ticker) is not imported in the visible import cell.

# # Fit linear regression model
# model = LinearRegression()
# model.fit(np.log(x_series.values.reshape(-1, 1)), y_test)

# # Predictions using the model
# predictions_reg = model.predict(np.log(x_series.values.reshape(-1, 1)))

# # Plot actual data and predictions
# plt.scatter(x_series, y_test, color='blue', label='Dane faktyczne')
# plt.scatter(x_series, predictions, color='red', label='Predykcje')

# # Plot regression line (drawn in grey)
# plt.plot(x_series, predictions_reg, color='grey', label='Regression Line')

# # Set x-axis to be logarithmic
# plt.xscale('log')

# # Add labels to the axes
# plt.xlabel('Miast w kolejności według populacji [mln]')
# plt.ylabel('Liczba dni koncertowych')

# # Add a title
# plt.title('Porównanie danych faktycznych i predykcji oraz linia regresji')

# # Add a legend
# plt.legend()

# # Set x-axis tick labels to non-scientific notation
# plt.gca().xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: '{:.0f}'.format(x)))

# plt.savefig('results/Porównanie danych faktycznych i predykcji oraz linia regresji.png', dpi=300)

# plt.show()


In [None]:
# Same frame as used for fitting the encoder, plus "City" so results can be labelled.
all_data_X = all_data[["Latitude", "Longitude", "Country", "City", "Population", "Attendance"]]

# Population-weighted draw of 200 cities to score.
# NOTE(review): the draw is from `all_data`, which also contains the visited
# cities - confirm that including them here is intended.
new_cities = (
    all_data_X
    .sample(n=200, weights=all_data_X["Population"], random_state=42069)
    .reset_index(drop=True)
)

# The fitted transformer expects the model features only, i.e. everything except "City".
model_feature_cols = ["Latitude", "Longitude", "Country", "Population", "Attendance"]
new_cities_encoded = ct.transform(new_cities[model_feature_cols])

# Predict and clip at zero, as was done for the test-set predictions.
preds = model.predict(new_cities_encoded)
new_cities["Number of concerts (predicted)"] = np.maximum(preds, 0)

new_cities["Number of concerts (predicted)"].describe()


In [None]:
# World map of the scored cities: marker area scales with population, marker
# colour encodes the predicted number of concert days (red = 0 ... green = 6).
# NOTE(review): the following map cell saves to the same file path, so on a
# full run this image is overwritten by the legend-based variant.
world = gpd.read_file("original/110m_cultural.zip")
prediction_map_ax = world.plot(figsize=(10, 6))

a = np.array(new_cities['Longitude'])
b = np.array(new_cities['Latitude'])

sizes = new_cities['Population'] / 100_000

# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('RdYlGn')  # Red to yellow to green colormap
norm = Normalize(vmin=0, vmax=6)
colors = cmap(norm(new_cities['Number of concerts (predicted)']))

# `colors` already holds explicit RGBA values, so no `cmap` is passed here:
# Matplotlib ignores it in that case and emits a warning.
prediction_map_ax.scatter(a, b, s=sizes, c=colors, alpha=0.7, edgecolors='w')

# Stand-alone mappable so the colorbar shows the full 0..6 scale at full opacity.
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])  # dummy array; older Matplotlib versions require one for a colorbar
cbar = plt.colorbar(sm, ax=prediction_map_ax)
prediction_map_ax.axis('off')

# Add a title
prediction_map_ax.set_title('Mapa świata z zaznaczoną predykcją liczby dni koncertowych')

plt.savefig('results/Mapa świata z zaznaczoną predykcją liczby dni koncertowych.png', dpi=300)


plt.show()

In [None]:
# World map of the scored cities with a discrete colour legend in a narrow
# side panel instead of a colorbar.

# Load world map
world = gpd.read_file("original/110m_cultural.zip")

# Two panels: the map (wide) and a legend-only panel (narrow)
fig, ax = plt.subplots(figsize=(12, 6), ncols=2, gridspec_kw={'width_ratios': [10, 1]})

# Plot world map
world.plot(ax=ax[0])

# Extract longitude and latitude
a = np.array(new_cities['Longitude'])
b = np.array(new_cities['Latitude'])

# Determine marker sizes based on population
sizes = new_cities['Population'] / 100_000

# Colour scale: red (0 days) through yellow to green (6 days).
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('RdYlGn')
norm = Normalize(vmin=0, vmax=6)

# Legend entries 0..6; labels and colours are derived from the same range so
# they cannot drift apart.
legend_values = range(7)
legend_labels = [str(value) for value in legend_values]
legend_colors = [cmap(norm(value)) for value in legend_values]  # colours along the gradient

# Plot scatter plot with colours mapped from the predicted values
scatter = ax[0].scatter(a, b, s=sizes, c=new_cities['Number of concerts (predicted)'], cmap=cmap, alpha=0.7, edgecolors='w', norm=norm)

# Create custom legend
legend_patches = [mpatches.Patch(color=color, label=label) for color, label in zip(legend_colors, legend_labels)]

# Add custom legend to the side panel
legend = ax[1].legend(handles=legend_patches, title='Liczba dni koncertowych', title_fontsize='large', loc='center')
ax[1].axis('off')

# Add a title
ax[0].set_title('Mapa świata z zaznaczoną predykcją liczby dni koncertowych')
ax[0].axis('off')

plt.tight_layout()

plt.savefig('results/Mapa świata z zaznaczoną predykcją liczby dni koncertowych.png', dpi=300)

plt.show()

In [None]:
# Predicted concert days against city population for the scored cities.
# NOTE(review): the x label says "[mln]" but the raw "Population" column is
# plotted without rescaling - confirm the units of that column.
relation_ax = plt.gca()
relation_ax.scatter(x=new_cities["Population"], y=new_cities["Number of concerts (predicted)"])

# Title and axis labels
relation_ax.set(
    title='Zależność między populacją miast a przewidywaną liczbą dni koncertowych',
    xlabel='Populacja miast [mln]',
    ylabel='Przewidywana liczba dni koncertowych',
)

plt.savefig('results/Zależność między populacją miast a przewidywaną liczbą dni koncertowych.png', dpi=300)

plt.show()

In [None]:
pred_col = "Number of concerts (predicted)"

# Work on a copy so `new_cities` keeps the unrounded predictions.
new_cities_rounded = new_cities.copy()
new_cities_rounded[pred_col] = new_cities_rounded[pred_col].round().astype("int32")

# Drop cities whose prediction rounds to zero days.
has_concerts = new_cities_rounded[pred_col] > 0
new_cities_rounded = new_cities_rounded[has_concerts]

# Keep cities, in their current order, while the running total of days stays below 150.
within_budget = new_cities_rounded[pred_col].cumsum() < 150
new_cities_rounded = new_cities_rounded[within_budget]

# One "City, Country - N days" line per remaining city.
day_counts = new_cities_rounded[pred_col].astype(str)
locations = (
    new_cities_rounded['City'] + ', ' + new_cities_rounded['Country']
    + " - " + day_counts + " days"
)

print("\n".join(locations))

In [None]:
# Wrap the location strings in a single-column table.
locations_df = pd.DataFrame({'Locations': locations})

# Write the table to CSV without the index column.
locations_df.to_csv('results/locations_table.csv', index=False)

# Another format such as Excel is also possible, e.g.:
# locations_df.to_excel('locations_table.xlsx', index=False)