In [1]:
import pandas as pd

from IPython.core.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))
# Don't wrap repr(DataFrame) across additional lines
pd.set_option("display.expand_frame_repr", True)

# Set max rows displayed in output to 25
pd.set_option("display.max_rows", 25)
%matplotlib inline
%matplotlib widget

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# ASK WIKIPEDIA FOR LIST OF COMPANIES
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

query = """#List of `instances of` "business enterprise"
SELECT ?com ?comLabel ?inception ?industry ?industryLabel ?coordinate ?country ?countryLabel WHERE {
  ?com (wdt:P31/(wdt:P279*)) wd:Q4830453;
    wdt:P625 ?coordinate.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
  OPTIONAL { ?com wdt:P571 ?inception. }
  OPTIONAL { ?com wdt:P452 ?industry. }
  OPTIONAL { ?com wdt:P17 ?country. }
}"""

def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

results = get_results(endpoint_url, query)

for result in results["results"]["bindings"]:
    print(result)

In [None]:
#PUT THE DATA ON THE RIGHT FORMAT into pandas
import os
import json
import pandas as pd
from pandas.io.json import json_normalize

# Get the dataset, and transform string into floats for plotting
dataFrame = pd.json_normalize(results["results"]["bindings"]) #in a serialized json-based format
df = pd.DataFrame(dataFrame) # into pandas
p = r'(?P<latitude>-?\d+\.\d+).*?(?P<longitude>-?\d+\.\d+)' #get lat/lon from string coordinates
df[['longitude', 'latitude']] = df['coordinate.value'].str.extract(p, expand=True)
df['latitude'] = pd.to_numeric(df['latitude'], downcast='float')
df['longitude'] = pd.to_numeric(df['longitude'], downcast='float')
data = pd.DataFrame(df, columns = ['latitude','longitude','comLabel.value','coordinate.value', 'inception.value', 'industryLabel.value', 'com.value', 'industry.value', 'country.value','countryLabel.value'])
data=data.dropna(subset=['latitude', 'longitude'])
data.rename(columns={'comLabel.value':'company'}, inplace=True)
data.rename(columns={'coordinate.value':'coordinate'}, inplace=True)
data.rename(columns={'inception.value':'inception'}, inplace=True)
data.rename(columns={'industryLabel.value':'industry'}, inplace=True)
data.rename(columns={'com.value':'id'}, inplace=True)
data.rename(columns={'industry.value':'id_industry'}, inplace=True)
data.rename(columns={'country.value':'id_country'}, inplace=True)
data.rename(columns={'countryLabel.value':'country'}, inplace=True)
data = pd.DataFrame (data) #cluster maps works ONLY with dataframe
print(data.shape)
print(data.sample(5))
print(data.info())

In [None]:
#DATA index cleaning
from sqlalchemy import create_engine
from pandas.io import sql
import re

IDs=[]
for name in data['id']:
    ID_n = name.rsplit('/', 1)[1]
    ID = re.findall('\d+', ID_n)
    #print(ID[0], ID_n)
    IDs.append(ID[0])
data ['ID'] = IDs
print (data['ID'].describe())
data['ID']= data['ID'].astype(int)
#print (data['ID'].describe())
data.rename(columns={'id':'URL'}, inplace=True)
data['company_foundation'] = data['inception'].str.extract(r'(\d{4})')
pd.to_numeric(data['company_foundation'])
data = data.set_index(['ID'])
print(data.columns)

In [None]:
#GET company-industry relationship data
industries = data.dropna(subset=['id_industry'])
#print(industries)

industries.groupby('id_industry')[['company', 'country']].apply(lambda x: x.values.tolist())
print(industries.info())

industries = pd.DataFrame (industries)
print(industries.sample(3))

In [None]:
IDs=[]
for name in industries['id_industry']:
    ID_n = name.rsplit('/', 1)[1]
    ID = re.findall('\d+', ID_n)
#    print(ID, ID_n)
    IDs.append(ID[0])
    
industries ['ID_industry'] = IDs
industries['ID_industry']= industries['ID_industry'].astype(int)
industries.set_index([industries.index, 'ID_industry'], inplace=True)
industries['id_wikipedia']=industries['id_industry']
industries.drop('id_industry', axis=1, inplace=True)  

industries = pd.DataFrame(industries)
print(industries.info())
print(industries.sample(3))

In [None]:
import plotly.express as px
import plotly.io as pio

px.defaults.template = "ggplot2"
px.defaults.color_continuous_scale = px.colors.sequential.Blackbody
#px.defaults.width = 600
#px.defaults.height = 400

#data = data.dropna(subset=['country'])

fig = px.scatter(data.dropna(subset=['country']), x="latitude", y="longitude", color="country")# width=400)
fig.show()
#break born into quarters and use it for the x axis; y has number of companies;

#fig = px.density_heatmap(countries_industries, x="country", y="companies", template="seaborn")
fig = px.density_heatmap(data, x="latitude", y="longitude")#, template="seaborn")
fig.show()

In [None]:
#COMPANIES IN COUNTRIES
fig = px.histogram(data.dropna(subset=['country', 'industry']), x="country",
                   title='COMPANIES IN COUNTRIES',
                  # labels={'industry':'industries'}, # can specify one label per df column
                   opacity=0.8,
                   log_y=False, # represent bars with log scale
                  # color_discrete_sequence=['indianred'], # color of histogram bars
                   color='industry',
                 #  marginal="rug", # can be `box`, `violin`
                #   hover_data="companies"
                   barmode='overlay'
                   )
fig.show()

#INDUSTRIES IN COUNTRIES
fig = px.histogram(data.dropna(subset=['industry', 'country']), x="industry",
                   title='INDUSTRIES IN COUNTRIES',
                  # labels={'industry':'industries'}, # can specify one label per df column
                   opacity=0.8,
                   log_y=False, # represent bars with log scale
                  # color_discrete_sequence=['indianred'], # color of histogram bars
                   color='country',
                 #  marginal="rug", # can be `box`, `violin`
                #   hover_data="companies"
                   barmode='overlay'
                   )
fig.show()

In [None]:
#THIS IS THE 2D MAP I COULD FIND, :)
import plotly.graph_objects as go
data['text'] = 'COMPANY: '+ data['company'] + '<br>COUNTRY: ' + data['country'] + '<br>FOUNDATION: ' + data['company_foundation'].astype(str)

fig = go.Figure(data=go.Scattergeo(
        locationmode = 'ISO-3',
        lon = data['longitude'],
        lat = data['latitude'],
        text = data['text'],
        mode = 'markers',
        marker = dict(
            size = 3,
            opacity = 0.8,
            reversescale = True,
            autocolorscale = False,
            symbol = 'square',
            line = dict(width=1, color='rgba(102, 102, 102)'),
         #   colorgroup='country'
         #   colorscale = 'Blues',
          #  cmin = 0,
          #  color = df['cnt'],
          #  cmax = df['cnt'].max(),
          #  colorbar_title="Incoming flights<br>February 2011"
        )))

fig.update_layout(
        title = 'Companies of the World<br>',
        geo = dict(
            scope='world',
          #  projection_type='albers usa',
            showland = True,
            landcolor = "rgb(250, 250, 250)",
            subunitcolor = "rgb(217, 217, 217)",
            countrycolor = "rgb(217, 217, 217)",
            countrywidth = 0.5,
            subunitwidth = 0.5
        ),
    )
fig.show()

In [None]:
print(data.info())
import tkinter as tk
from tkinter import filedialog
from pandas import DataFrame

root= tk.Tk()
canvas1 = tk.Canvas(root, width = 300, height = 300, bg = 'lightsteelblue2', relief = 'raised')
canvas1.pack()

def exportCSV ():
    global df
    
    export_file_path = filedialog.asksaveasfilename(defaultextension='.csv')
    data.to_csv (export_file_path, index = True, header=True)

saveAsButton_CSV = tk.Button(text='Export CSV', command=exportCSV, bg='green', fg='white', font=('helvetica', 12, 'bold'))
canvas1.create_window(150, 150, window=saveAsButton_CSV)

root.mainloop()