In [145]:
"""
name, address, country, coordinates, job, company, current_location, birthday,
catch_phrase, word, words, (characteristic_features), latitude, longitude

1165 total criminals:

2019/10/30
Purpose: to generate a table with fake data
"""
from datetime import datetime
# import random
import numpy as np
import pandas as pd
from faker import Faker

from faker.generator import random

Faker.seed(4321)
faker = Faker()



main_dict = {'France': {'number_of_agents': 324, 'city': 'Paris',
                    'city_coordinates': ('48.8566', '2.3522'),
                        'faker_abbrev': 'fr_FR'},
            'United Kingdom': {'number_of_agents': 450, 'city': 'London',
                    'city_coordinates': ('51.5074', '-0.1278'),
                               'faker_abbrev': 'en_GB'},
            'Germany': {'number_of_agents': 251, 'city': 'London',
                    'city_coordinates': ('50.1109', '8.6821'),
                        'faker_abbrev': 'de_DE'},
            'Netherlands': {'number_of_agents': 140, 'city': 'Amsterdam',
             'city_coordinates': ('52.3667', '4.8945'), 'faker_abbrev':
                                'nl_NL'}
            }



def parse_main_dict():
    """Parses dict to get the lists of
    countries, cities, and fakers. Fakers allow generation of region specific fake data.
    Also generates total number of agents
    """
    Faker.seed(4321)
    
    countries = main_dict.keys()
    cities = [v['city'] for v in main_dict.values()]
    fakers = [Faker(v['faker_abbrev']) for v in main_dict.values()]
    total_agents = sum([v['number_of_agents'] for v in main_dict.values()])


    return fakers, countries, cities, total_agents


def generate_lat_lon(country, main_dict):
    """
    Generates latitude and longitude for a city in the country. 
    The values have a defined range of randomness near
    the city (the coordinates of the city are from the main_dict).
    """
    lats = []
    lons = []
    n_agents = main_dict[country]['number_of_agents']
    print("n_agents: ", n_agents)
    for i in range(n_agents):
        lat = float(main_dict[country]['city_coordinates'][0])
        lon = float(main_dict[country]['city_coordinates'][1])
        dev = 0.25
        min_lat, max_lat = lat - dev, lat + dev
        min_lon, max_lon = lon - dev, lon + dev
        round_to = 4
        lat1 = round(random.uniform(min_lat, max_lat), round_to)
        lon1 = round(random.uniform(min_lon, max_lon), round_to)
        lats.append(str(lat1))
        lons.append(str(lon1))
    
    return lats, lons


def generate_aliases(n_agents):
    """"""
    with open('nicknames_str.txt', 'r') as f:
        aliases = [i.capitalize() for i in f.read().split()]
                    
    aliases_unique = list(set(aliases))
    aliases = aliases_unique + [None for i in range(n_agents - len(aliases_unique))]
    random.shuffle(aliases)
    
    return aliases


def generate_country_criminals_df(regional_faker, country):

    """
    Creates a dataframe with columns: name, nickname, address, city, 
    country, latitude, longitude.
    """

    n_agents = main_dict[country]['number_of_agents']
    
    name_col = [regional_faker.name() for i in range(n_agents)]
    address_col = [regional_faker.address().replace('\n', ' ') for i in
                   range(n_agents)]

    lats_col, lons_col = generate_lat_lon(country, main_dict)
    cols_data = [name_col, address_col, lats_col, lons_col]
    
    column_names = ['name', 'address', 'lat', 'lon']
         
    country_criminals_dict = {col: data for (col, data) in zip(column_names, cols_data)}
    df = pd.DataFrame(country_criminals_dict)
    
    # add columns of country and city
    df['country'] = country
    df['city'] = main_dict[country]['city']
    df['id'] = df.index
    
    df = df[['name', 'id', 'address', 'lat', 'lon', 'country', 'city']]

    return df


def add_date_column(df):
    """Creates random dates within the last year. """
    faker_en = Faker('en_GB')
    
    def date_using_faker(x):
        return faker_en.date_between(start_date='-360d', end_date='today')

    df['date'] = df['name'].apply(date_using_faker)  # can use any column; here we are using 'name'
    df['date'] = df['date'].astype('datetime64')
    
    return df


def add_date_not_sunday(value):
    """Create fake date that is not a Sunday"""
    def weekday(date):
        """ input 'date' - datetime object or datetime64"""
        # uncomment line before if the date passed is in string format; 
        # date = datetime.strptime(date, "%Y-%m-%d")  # change the format if necessary
        return date.strftime("%A")
    
    faker_en = Faker('en_GB')
    fake_date = faker_en.date_between(start_date='-360d', end_date='today')
    
    # fake_date = date_using_faker('mock_arg')
    while weekday(fake_date) == 'Sunday': 
        fake_date = faker_en.date_between(start_date='-360d', end_date='today')
    print("fake_date: {}".format(fake_date))
    return fake_date


def add_moriarty_profile(df):
    """Set date to be NOT Sunday and alias as np.nan."""
    faker_en = Faker('en_GB')

    df_moriarty = df.loc[(df.country == "United Kingdom") & (df.crime_type == "weapons sale")] \
                        .sort_values('profit', ascending = False).reset_index()
    hidden_moriarty_name = df_moriarty.name.tolist()[0]
    df_moriarty = df.loc[df.name == hidden_moriarty_name]
    df_moriarty = df_moriarty.copy()
    
    print("Df before moriarty creations shape: {}".format(df.shape[0]))
    print(df.loc[df.name == hidden_moriarty_name])
    
    df_not_moriarty = df.loc[df.name != hidden_moriarty_name]
    
    df_moriarty["date"] = add_date_not_sunday('test')
    df_moriarty["date"] = df_moriarty["date"].astype('datetime64')
    df_moriarty['alias'] = None
    
    df = pd.concat([df_not_moriarty, df_moriarty])
    print("Df after moriarty creations shape: {}".format(df.shape[0]))
    print(df.loc[df.name == hidden_moriarty_name])
    
    return df


def generate_crime_types(df):
    """
    Generate a column of crime types  for all criminals.
    Each type has its own defined fraction of all criminals to ensure
    weapons sales has the most sales (in money units) and 
    that other crime types profits look realistic.
    
    """
    
    crimes_dict = {'weapons sale': {'factor':100.0, 'fraction': 0.05}, 
               'drug sale': {'factor':9.0, 'fraction': 0.08}, 
               'robbery': {'factor':0.2, 'fraction': 0.17}, 
               'forgery': {'factor':0.12, 'fraction': 0.10},
               'theft':{'factor':0.08, 'fraction': 0.4},
               'pickpocketing': {'factor':0.01, 'fraction': 0.2}
              }
    
    full_crimes_list = []
    for k in list(crimes_dict.keys()):
        if k != list(crimes_dict.keys())[-1]:
            times = int(df.shape[0]*crimes_dict[k]['fraction'])
        else:
            times = df.shape[0] - len(full_crimes_list)
        crimes = [k for i in range(times)]
        full_crimes_list += crimes

    print("df shape0: ", df.shape[0])

    random.shuffle(full_crimes_list)  
    print(len(full_crimes_list))
    print(full_crimes_list[0:10])
    df.loc[:, 'crime_type'] = full_crimes_list

    
    
    def generate_criminal_profits(crime_type):
        """
        Adds profit information for each criminal.
        Factor from the dict is used to make crime types profits look realistic
        """
        return int(-( crimes_dict[crime_type]['factor'] * (-random.randrange(100, 5000, 10)) // 1))

    df['profit'] = df['crime_type'].apply(generate_criminal_profits)
    
    return df


def split_main_df_into_countries(df):
    """
    Split the main df into 5 dfs based on the country for unioning in the test.
    Include columns: name, alias, lat, lon.
    (with names specific to the corresponding countries).
    Save to the current folder.
    """
    df_all_countries = df.copy()

    country_list = df_all_countries["country"].unique().tolist()
    
    dfs_dict = {}
    for num, country_ in enumerate(country_list):
        # print(country_)
        df_country = df_all_countries.loc[(df_all_countries.country == country_)]
        df = df_country.copy()
        df = df[["id", "name", "alias", "lat", "lon"]]
        
        # add country-specific column names
        if country_ == "Germany" or country_ == "Netherlands":
            cols = ["id", "benennen", "aliasnamen", "breitengrad", "länge"]
        elif country_ == "France":
            cols = ["id", "nom", "pseudonyme", "latitude", "longitude"]
        else:
            cols = ["id", "name", "alias", "latitude", "longitude"]
            
        # assign column names
        df.columns = cols
        # save as file with country name
        file_name = "criminals_{}.csv".format(country_)
        print("Saved: {}".format(file_name))
        df.to_csv(file_name, header = True, index = False)
        
        # generate crime_type, profit csvs
        df = df_country.copy()
        df = df[["name", "crime_type", "profit"]]
        file_name = "crime_type_profit_{}.txt".format(country_)
        print("Saved: {}".format(file_name))
        df.to_csv(file_name, sep=' ', header = True, index = False)
        
        

def save_id_date_df(df):
    """
    Add date of the last crime. Leave only 'id' and 'date' (for joining on id)
    """

    df_id_date = df[["id", "date", "country"]]
    file_name = "id_dates.csv"
    df_id_date.to_csv(file_name, header = True, index=False)
    print("Saved: {}".format(file_name))

    
def add_weekday_column(df):
    """"""
    def weekday(date):
        """ input 'date' - datetime object or datetime64"""
        return date.strftime("%A")

    df["weekday"]= df["date"].apply(weekday)
    
    return df
    
    
def identify_top_country(df):
    """"""
    df_ws_top = df.loc[df["crime_type"] == "weapons sale"].\
                            groupby(["country"]).\
                            agg({"profit": "sum"}).\
                            sort_values('profit', ascending = False).\
                            reset_index()

    country_with_top_weapons_sales = df_ws_top.country.tolist()[0]
    
    return country_with_top_weapons_sales


def identify_moriarty(df, country):
    """"""
    # select top 5 weapons sales records(rows) among those who don't sell on Sunday
    df = df.loc[(df.country == country_with_top_ws) & 
                                  (df.weekday != 'Sunday')].\
                            sort_values('profit', ascending = False).reset_index()

    moriarty_name = df.name[0]
    
    return moriarty_name


def get_correct_answer(df, save=True):
    """"""
    top_country = identify_top_country(df)
    #
    df_with_weekday = add_weekday_column(df)
    moriarty_name = identify_moriarty(df, top_country)
    
    if save:
        with open('correct_answer.txt', 'w') as f:
            f.write(moriarty_name)
            
    return moriarty_name
    
    

def generate_df_with_criminals():
    """
    Creates a pandas dataframe of criminals for all countries.
    1.Generates aliases.
    2. Iterates over countries to create dataframe of criminals per country.
    3. Combines all country dfs into the main dataframe.
    """
    fakers, countries, cities, n_agents = parse_main_dict()
    
                    
    country_criminals_dfs = []
    for faker, country in zip(fakers, countries):
        df = generate_country_criminals_df(faker, country)
        country_criminals_dfs.append(df)
              
    df = pd.concat(country_criminals_dfs, ignore_index=True, sort=False)
    aliases = generate_aliases(n_agents)
    df['alias'] = pd.Series(aliases)

    print("Initial shape: {}".format(df.shape[0]))
    print(df.columns.tolist())
    df.drop_duplicates(subset=['name'], inplace=True)
    print("Initial shape2: {}".format(df.shape[0]))
    print(df.columns.tolist())

    df = add_date_column(df)
    print("Final shape: {}".format(df.shape[0]))
    print(df.columns.tolist())
    df = generate_crime_types(df)

    print(df.columns.tolist())
    
    df = df[['name', 'alias', 'id', 'address', 'lat', 'lon', 'country', \
             'city', 'date', 'crime_type', 'profit']]

    df_copy = df.copy()
    df = add_moriarty_profile(df_copy)
    split_main_df_into_countries(df)  # saves csvs
    
    save_id_date_df(df)  # save id and date for future join

    return df


main_criminals_df = generate_df_with_criminals()
main_criminals_df.sort_values("name").head(5)
      
#generated_criminals_df.to_csv('criminals.csv', header=True, index=False)


n_agents:  324
n_agents:  450
n_agents:  251
n_agents:  140
Initial shape: 1165
['name', 'id', 'address', 'lat', 'lon', 'country', 'city', 'alias']
Initial shape2: 1164
['name', 'id', 'address', 'lat', 'lon', 'country', 'city', 'alias']
Final shape: 1164
['name', 'id', 'address', 'lat', 'lon', 'country', 'city', 'alias', 'date']
df shape0:  1164
1164
['forgery', 'theft', 'theft', 'theft', 'pickpocketing', 'pickpocketing', 'pickpocketing', 'forgery', 'forgery', 'theft']
['name', 'id', 'address', 'lat', 'lon', 'country', 'city', 'alias', 'date', 'crime_type', 'profit']
Df before moriarty creations shape: 1164
                name alias   id                                       address  \
739  Mr. James Evans  None  415  Flat 32 Stuart prairie Lake Melissa EC32 3JQ   

         lat      lon         country    city       date    crime_type  profit  
739  51.6388  -0.1794  United Kingdom  London 2020-03-26  weapons sale  456000  
fake_date: 2020-09-18
Df after moriarty creations shape: 116

Unnamed: 0,name,alias,id,address,lat,lon,country,city,date,crime_type,profit
1125,Aaron de Strigter,,100,Fembaan 916 1389DU Godlinze,52.1659,4.8324,Netherlands,Amsterdam,2020-05-09,weapons sale,451000
463,Abbie Jones,,139,Flat 45 Elliott knolls Fowlerberg HX7W 9TQ,51.5988,-0.3318,United Kingdom,London,2020-10-30,weapons sale,102000
772,Abbie Quinn,Lil’,448,Studio 1 Moore plaza Lake Chelsea WA69 8HF,51.4066,-0.1305,United Kingdom,London,2020-11-25,theft,168
507,Abdul Rhodes,Brandy,183,Flat 00L Slater tunnel Watsonshire M89 0BZ,51.4636,-0.1,United Kingdom,London,2020-08-08,theft,336
590,Abdul Wilson-Ryan,,266,Studio 45 Nolan circles Grantbury M0 2AA,51.2687,-0.259,United Kingdom,London,2020-10-26,robbery,892


In [61]:
# get_correct_answer(df, save=True)
moriarty_name2 = get_correct_answer(main_criminals_df)
print(moriarty_name2)
with open('puzzle.txt') as f:
    puzzle_text = f.read()
    
print(puzzle_text.format(moriarty_name2))

Mr. James Evans
Watson this is text. The top country is Mr. James Evans.


In [67]:
df = pd.DataFrame({'date': ['2020-10-08', '2020-10-08', '2020-10-08']})
df.date = df.date.astype("datetime64")
aliases = ['a', 'b', 'c']
df['alias'] = pd.Series(aliases)
df

Unnamed: 0,date,alias
0,2020-10-08,a
1,2020-10-08,b
2,2020-10-08,c


In [118]:
def add_weekday_column(df):
    """"""
    def weekday(date):
        """ input 'date' - datetime object or datetime64"""
        return date.strftime("%A")

    df["weekday"]= df["date"].apply(weekday)
    df['new2'] = 11
    
    return df

df2 = add_weekday_column(df)
df2.loc[:, "new_col"] = 10
df2.head()

Unnamed: 0,date,alias,weekday,new_col,new2
0,2020-10-08,a,Thursday,10,11
1,2020-10-08,b,Thursday,10,11
2,2020-10-08,c,Thursday,10,11


In [142]:
def add_moriarty_profile(df):
    """Set date to be NOT Sunday and alias as np.nan."""
    faker_en = Faker('en_GB')
    #
    df_moriarty = df.copy()
    df_moriarty = df.loc[(df.country == "United Kingdom") & (df.crime_type == "weapons sale")] \
                        .sort_values('profit', ascending = False).reset_index()
    hidden_moriarty_name = df_moriarty.name.tolist()[0]
    df_moriarty = df.loc[(df.name == hidden_moriarty_name), :]
#     df_moriarty = df_moriarty_test.copy()
    
    print("Df before moriarty creations shape: {}".format(df.shape[0]))
    print(df.loc[df.name == hidden_moriarty_name])
    
    df_not_moriarty = df.loc[df.name != hidden_moriarty_name]
    
    df_moriarty["date"] = add_date_not_sunday('test')
    df_moriarty["date"] = df_moriarty["date"].astype('datetime64')
    df_moriarty['alias'] = None
    
    df = pd.concat([df_not_moriarty, df_moriarty])
    print("Df after moriarty creations shape: {}".format(df.shape[0]))
    print(df.loc[df.name == hidden_moriarty_name])
    
    return df


df_out = add_moriarty_profile(main_criminals_df)
df_out.head(3)

Df before moriarty creations shape: 1164
                name alias   id                                       address  \
739  Mr. James Evans  None  415  Flat 32 Stuart prairie Lake Melissa EC32 3JQ   

         lat      lon         country    city       date    crime_type  profit  
739  51.6388  -0.1794  United Kingdom  London 2020-09-17  weapons sale  456000  
fake_date: 2020-09-15
Df after moriarty creations shape: 1164
                name alias   id                                       address  \
739  Mr. James Evans  None  415  Flat 32 Stuart prairie Lake Melissa EC32 3JQ   

         lat      lon         country    city       date    crime_type  profit  
739  51.6388  -0.1794  United Kingdom  London 2020-09-15  weapons sale  456000  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,name,alias,id,address,lat,lon,country,city,date,crime_type,profit
0,Guillaume Girard,Princess,0,"4, chemin Alfred Weiss 52789 Dumas-sur-Remy",48.8696,2.4629,France,Paris,2019-12-04,forgery,497
1,Gilbert Neveu,,1,"4, avenue Margaux Barbe 30335 Sainte Benjamin-...",48.9893,2.3868,France,Paris,2020-04-04,theft,43
2,Christophe Dupont,,2,"82, rue de Bertin 91395 Georges",48.7635,2.2631,France,Paris,2020-11-04,theft,244


# THE GOAL

The goal is to identify the name behind which hides Moriarty using the intel (data in the supplied files) from the police, Interpol, and undercover agents about the criminals. 


# SOLUTION

Story text:
-Watson, just like our grand-grand-fathers we are again after Moriarty. 

We need to catch him. H-mmm... I need to be careful here - maybe it is not him, maybe it is her. All we know is 
that someone is masterminding unlawful activities and planning something bad. The Interpol agents, with the help of my boys, collected information that should provide us the clues to determine the name Moriarty's is hiding brhind and arrest him.

-I have a number of .csv and .txt files about criminal activity and high-profile suspicious sales that were sent over from our neighbors: France, Germany, Netherlands, and our own MI-6 in the United Kingdom.

So, the first task would be to combine the data into one table. I requested info on the name, alias, and the location of the last known whereabouts, as latitude and longitude, but since the data comes from all around the Europe they might have named the columns differently.

I am thinking that adding the country to the data might be helpful in our future analysis.

Lastly, from my correspondence with our undercover agents, all the activity seems to be happening around major financial centers. If the city names are not in the data, I suppose you can extract it based on the latitude and logitude. Mmmm... And a map of course, unless your knowledge of Europe's geography is excepitonal. 





Text:
Tasks:
1. Read in data from the files into a separate dataframe and add the country name ('country' column).
2. Identify the city around which the criminals operate. Add it to the dataframe ('city' column).
3. Concatenate dfs into a single dataframe with the four original columns renamed to: [name, alias, latitude, longitude]
4. Fill NAs in aliases with an empty string.


In [3]:
from datetime import datetime
import random
import pandas as pd

In [4]:
#sample one of the csvs
country_ = "France"
file_name = "criminals_{}.csv".format(country_)
df_country = pd.read_csv(file_name, index_col=False)
print(df_country.columns)
df_country.head(2)

Index(['id', 'nom', 'pseudonyme', 'latitude', 'longitude'], dtype='object')


Unnamed: 0,id,nom,pseudonyme,latitude,longitude
0,0,Guillaume Girard,Princess,48.8696,2.4629
1,1,Gilbert Neveu,,48.9893,2.3868


In [5]:
#explore the dataframes: column names, shapes and combine into a single dataframe
country_list = ["United Kingdom", "Germany", "Netherlands", "France"]
dfs_dict = {}
for country_ in country_list:
    file_name = "criminals_{}.csv".format(country_)
    df = pd.read_csv(file_name, index_col=False)
    print(list(df.columns), df.shape)
    df.columns = ["id", "name", "alias", "latitude", "longitude"]
    df["country"] = country_
    dfs_dict[country_] = df  # add data frame to the dict for a future union
print("Len dfs_dict: {}".format(len(dfs_dict)))

# combine(concatenate/union) into a single dataframe
df_criminals_combined = pd.concat(dfs_dict.values())
print("Combined shape: {}".format(df_criminals_combined.shape))
df_criminals_combined.head(3)

['id', 'name', 'alias', 'latitude', 'longitude'] (449, 5)
['id', 'benennen', 'aliasnamen', 'breitengrad', 'länge'] (251, 5)
['id', 'benennen', 'aliasnamen', 'breitengrad', 'länge'] (140, 5)
['id', 'nom', 'pseudonyme', 'latitude', 'longitude'] (324, 5)
Len dfs_dict: 4
Combined shape: (1164, 6)


Unnamed: 0,id,name,alias,latitude,longitude,country
0,0,Dr. June Ellis,Duchess,51.6247,-0.0471,United Kingdom
1,1,Mr. Colin Curtis,,51.3591,-0.1497,United Kingdom
2,2,Rosie Begum,,51.4863,0.1052,United Kingdom


In [6]:
# calculate mean latitude and longitude to identify the major financial centers (cities)
# (copy and paste the lat, lon values into Google Maps)
for country_ in country_list:
    test_df = df_criminals_combined.loc[df_criminals_combined.country == country_]
    print("Country: {}, (lat, lon): {}, {}".format(country_, 
                                                   round(test_df.latitude.mean(), 4), 
                                                   round(test_df.longitude.mean(), 4)))
    print(40 * "*")

Country: United Kingdom, (lat, lon): 51.5117, -0.114
****************************************
Country: Germany, (lat, lon): 50.11, 8.6908
****************************************
Country: Netherlands, (lat, lon): 52.3695, 4.8737
****************************************
Country: France, (lat, lon): 48.856, 2.3511
****************************************


In [7]:
country_list

['United Kingdom', 'Germany', 'Netherlands', 'France']

In [8]:
# add the city name to the df

#it can be done using a series of if/else statements, such as 'if country_ == 'France': city = 'Paris', etc. OR
# using a dictionary as below:
country_city_dict = {"United Kingdom": "London", "Germany": "Frankfurt", "Netherlands": "Amsterdam", "France": "Paris"}
country_city_dict

test_dfs = []
for country_ in country_list:
    test_df = df_criminals_combined.loc[df_criminals_combined.country == country_].copy()
    test_df["city"] = country_city_dict[country_]
    test_dfs.append(test_df)
print("Len test_dfs: {}".format(len(test_dfs)))
df_with_city = pd.concat(test_dfs)
print(df_with_city.shape)
df_with_city.head(5)

Len test_dfs: 4
(1164, 7)


Unnamed: 0,id,name,alias,latitude,longitude,country,city
0,0,Dr. June Ellis,Duchess,51.6247,-0.0471,United Kingdom,London
1,1,Mr. Colin Curtis,,51.3591,-0.1497,United Kingdom,London
2,2,Rosie Begum,,51.4863,0.1052,United Kingdom,London
3,3,Alexander Williams,,51.4247,-0.0116,United Kingdom,London
4,4,Charlotte Taylor,,51.3711,-0.0108,United Kingdom,London


In [9]:
# Fillna in alias.
df_with_city = df_with_city.fillna({"alias": ""})
print("Df shape: {}".format(df_with_city.shape[0]))
df_with_city.sort_values("name").head(5)

Df shape: 1164


Unnamed: 0,id,name,alias,latitude,longitude,country,city
100,100,Aaron de Strigter,,52.1659,4.8324,Netherlands,Amsterdam
139,139,Abbie Jones,,51.5988,-0.3318,United Kingdom,London
446,448,Abbie Quinn,Lil’,51.4066,-0.1305,United Kingdom,London
183,183,Abdul Rhodes,Brandy,51.4636,-0.1,United Kingdom,London
266,266,Abdul Wilson-Ryan,,51.2687,-0.259,United Kingdom,London


# Task 2
Add crime_type and profit info to criminals. 
#(merge/join) criminals table with the crime type and profit information.

- Great, Watson! 
- Now we need to know what everyone of those supspects did wrong, that is the crime type, and desirably, how much they profited from it: Moriarty is not a small fish. 

- You'll need to add the crime type and the profit from the files to the table you already put together. Be mindful of the file types. I also believe that the separator in these file maybe different from the files you used previously.

# Solution (task 2)

In [10]:
df = pd.read_csv("crime_type_profit_France.txt", index_col=False, sep=" ")
print("Columns: ", list(df.columns))

Columns:  ['name', 'crime_type', 'profit']


In [11]:
# union(concatenate) files for the latest crime dates

country_list = ["United Kingdom", "Germany", "Netherlands", "France"]
dfs_dict = {}
for country_ in country_list:
    file_name = "crime_type_profit_{}.txt".format(country_)
    df = pd.read_csv(file_name, index_col=False, sep=" ")
    print(list(df.columns), df.shape)
    df["country"] = country_
    dfs_dict[country_] = df
print("Len dfs_dict: {}".format(len(dfs_dict)))

#combine all dataframes into one
df_crime_type_profit = pd.concat(dfs_dict.values())
print(list(df_crime_type_profit.columns))

df_crime_type_profit.head(10)

['name', 'crime_type', 'profit'] (449, 3)
['name', 'crime_type', 'profit'] (251, 3)
['name', 'crime_type', 'profit'] (140, 3)
['name', 'crime_type', 'profit'] (324, 3)
Len dfs_dict: 4
['name', 'crime_type', 'profit', 'country']


Unnamed: 0,name,crime_type,profit,country
0,Dr. June Ellis,theft,380,United Kingdom
1,Mr. Colin Curtis,theft,241,United Kingdom
2,Rosie Begum,theft,282,United Kingdom
3,Alexander Williams,theft,342,United Kingdom
4,Charlotte Taylor,theft,268,United Kingdom
5,Stacey Whitehead,theft,48,United Kingdom
6,Jennifer Scott,theft,269,United Kingdom
7,Julian Taylor,forgery,154,United Kingdom
8,Heather Perkins,forgery,300,United Kingdom
9,Joan Roberts,theft,313,United Kingdom


In [12]:
# drop duplicates 
df_with_city[["name"]].drop_duplicates().shape[0]

1164

In [13]:
# join main criminal info with crime type and profit
df_city_profit = pd.merge(df_with_city, df_crime_type_profit, on=["name","country"], how="left")
print("Df shape: {}".format(df_city_profit.shape[0]))
print(df_city_profit.columns)
df_city_profit.sort_values('profit', ascending = False).head(4)

Df shape: 1164
Index(['id', 'name', 'alias', 'latitude', 'longitude', 'country', 'city',
       'crime_type', 'profit'],
      dtype='object')


Unnamed: 0,id,name,alias,latitude,longitude,country,city,crime_type,profit
682,233,Cläre Reinhardt-Pechel,,50.1011,8.8447,Germany,Frankfurt,weapons sale,475000
489,40,Kira Junk-Henck,,49.8788,8.5765,Germany,Frankfurt,weapons sale,474000
789,89,Noëlle Buijs-Corstiaens,,52.2545,4.7195,Netherlands,Amsterdam,weapons sale,471000
448,415,Mr. James Evans,,51.6388,-0.1794,United Kingdom,London,weapons sale,456000


In [14]:
#investigate crime types
df_city_profit["crime_type"].value_counts()

theft            465
pickpocketing    235
robbery          197
forgery          116
drug sale         93
weapons sale      58
Name: crime_type, dtype: int64

In [15]:
df_city_profit.loc[df_city_profit["crime_type"] == "weapons sale"].groupby(["country"]).agg({"profit": "sum"})

Unnamed: 0_level_0,profit
country,Unnamed: 1_level_1
France,3186000
Germany,2489000
Netherlands,2112000
United Kingdom,6062000


In [16]:
df_weapons_sales = df_city_profit.loc[df_city_profit["crime_type"] == "weapons sale"].sort_values("profit", ascending = False)
df_weapons_sales.head(5)

Unnamed: 0,id,name,alias,latitude,longitude,country,city,crime_type,profit
682,233,Cläre Reinhardt-Pechel,,50.1011,8.8447,Germany,Frankfurt,weapons sale,475000
489,40,Kira Junk-Henck,,49.8788,8.5765,Germany,Frankfurt,weapons sale,474000
789,89,Noëlle Buijs-Corstiaens,,52.2545,4.7195,Netherlands,Amsterdam,weapons sale,471000
448,415,Mr. James Evans,,51.6388,-0.1794,United Kingdom,London,weapons sale,456000
800,100,Aaron de Strigter,,52.1659,4.8324,Netherlands,Amsterdam,weapons sale,451000


In [17]:
df_weapons_alias_null = df_city_profit.loc[(df_city_profit["crime_type"] == "weapons sale") & (df_city_profit.alias == " ") ]
df_weapons_alias_null.head(5)

Unnamed: 0,id,name,alias,latitude,longitude,country,city,crime_type,profit


# PART 3

Add date (last deal date) Moriarty does not deal on Sundays

In [18]:
id_dates = pd.read_csv("id_dates.csv", index_col=False)
print("id_dates shape: {}".format(id_dates.shape[0]))
id_dates.head(4)

id_dates shape: 1164


Unnamed: 0,id,date,country
0,0,2019-12-04,France
1,1,2020-04-04,France
2,2,2020-11-04,France
3,3,2020-01-12,France


In [19]:
df_weapons_with_dates = pd.merge(df_city_profit, id_dates, on=["id", "country"], how="left")
print(df_weapons_with_dates.shape)
df_weapons_with_dates.head(3)

(1164, 10)


Unnamed: 0,id,name,alias,latitude,longitude,country,city,crime_type,profit,date
0,0,Dr. June Ellis,Duchess,51.6247,-0.0471,United Kingdom,London,theft,380,2020-07-08
1,1,Mr. Colin Curtis,,51.3591,-0.1497,United Kingdom,London,theft,241,2020-06-19
2,2,Rosie Begum,,51.4863,0.1052,United Kingdom,London,theft,282,2020-09-01


In [20]:
df_weapons_with_dates_copy = df_weapons_with_dates.copy()
df_weapons_with_dates_copy["date"] = df_weapons_with_dates_copy["date"].astype("datetime64")
df_weapons_with_dates_copy.dtypes

id                     int64
name                  object
alias                 object
latitude             float64
longitude            float64
country               object
city                  object
crime_type            object
profit                 int64
date          datetime64[ns]
dtype: object

In [40]:
def weekday(date):
    """ input 'date' - datetime object or datetime64"""
    # uncomment line before if the date passed is in string format; 
    # date = datetime.strptime(date, "%Y-%m-%d")  # change the format if necessary
    return date.strftime("%A")

df_weapons_with_dates_copy["weekday"]= df_weapons_with_dates_copy["date"].apply(weekday)
df_weapons_with_weekday = df_weapons_with_dates_copy
df_weapons_with_weekday.head(4)

Unnamed: 0,id,name,alias,latitude,longitude,country,city,crime_type,profit,date,weekday
0,0,Dr. June Ellis,Duchess,51.6247,-0.0471,United Kingdom,London,theft,380,2020-07-08,Wednesday
1,1,Mr. Colin Curtis,,51.3591,-0.1497,United Kingdom,London,theft,241,2020-06-19,Friday
2,2,Rosie Begum,,51.4863,0.1052,United Kingdom,London,theft,282,2020-09-01,Tuesday
3,3,Alexander Williams,,51.4247,-0.0116,United Kingdom,London,theft,342,2020-09-23,Wednesday


In [22]:
df_weapons_not_sunday = df_weapons_with_dates_copy.loc[df_weapons_with_dates_copy.weekday != "Sunday"]
print(df_weapons_not_sunday.shape)


(996, 11)


# Part 4
Watson, we got an important piece of information that Moriarty supervises a network of weapons sales
in the country with the most sales. So, the top seller (in the country with most weapons sales overall) who didn't sell on a Sunday will be him.

In [41]:
df_ws_top = df_weapons_with_weekday.loc[df_weapons_with_weekday["crime_type"] == "weapons sale"].\
                            groupby(["country"]).\
                            agg({"profit": "sum"}).\
                            sort_values('profit', ascending = False).\
                            reset_index()

country_with_top_ws = df_ws_top.iloc[0, 0]
print("country_with_top_ws: {}".format(country_with_top_ws))


country_with_top_ws: United Kingdom


In [42]:
# select top 5 weapons sales records(rows) among those who don't sell on Sunday
df_weapons_UK = df_weapons_with_weekday.loc[df_weapons_with_weekday.country == country_with_top_ws].\
                            sort_values('profit', ascending = False).reset_index()

df_weapons_UK_top5 = df_weapons_UK.head(5)
df_weapons_UK_top5.head()

Unnamed: 0,index,id,name,alias,latitude,longitude,country,city,crime_type,profit,date,weekday
0,448,415,Mr. James Evans,,51.6388,-0.1794,United Kingdom,London,weapons sale,456000,2020-09-17,Thursday
1,263,263,Francis Robinson,,51.5129,-0.0909,United Kingdom,London,weapons sale,433000,2020-08-13,Thursday
2,426,428,Gail Jones,,51.3539,0.1046,United Kingdom,London,weapons sale,387000,2019-12-22,Sunday
3,130,130,Mrs. Francesca Ahmed,,51.506,-0.1847,United Kingdom,London,weapons sale,370000,2020-01-26,Sunday
4,95,95,Mr. Ben Parker,,51.5908,-0.2698,United Kingdom,London,weapons sale,357000,2020-08-29,Saturday


In [43]:
# select top 5 weapons sales records(rows) among those who don't sell on Sunday
df_weapons_with_weekday_top_country = df_weapons_with_weekday.loc[(df_weapons_with_weekday.country == country_with_top_ws) & 
                                  (df_weapons_with_weekday.weekday != 'Sunday')].\
                            sort_values('profit', ascending = False).reset_index()

df_weapons_with_weekday_top_country.head(5)

Unnamed: 0,index,id,name,alias,latitude,longitude,country,city,crime_type,profit,date,weekday
0,448,415,Mr. James Evans,,51.6388,-0.1794,United Kingdom,London,weapons sale,456000,2020-09-17,Thursday
1,263,263,Francis Robinson,,51.5129,-0.0909,United Kingdom,London,weapons sale,433000,2020-08-13,Thursday
2,95,95,Mr. Ben Parker,,51.5908,-0.2698,United Kingdom,London,weapons sale,357000,2020-08-29,Saturday
3,388,389,Ms. Sara Williams,,51.6528,-0.2348,United Kingdom,London,weapons sale,342000,2020-02-12,Wednesday
4,36,36,Linda Lawrence,,51.4828,-0.1148,United Kingdom,London,weapons sale,324000,2019-12-07,Saturday


In [44]:
df_weapons_UK_top5.name.iloc[0]

'Mr. James Evans'

In [None]:
#TODO:
1. add some existing records to the dataset to create duplicates (so the set can be deduplicated)
2. add creation of a table with 2 columns: name and crime_counts. The crime count for top 5 
    is random. This will be the Moriarty. 
3. Create problem text with dynamically inserted values (including the crime count)



In [None]:
#END