In [3]:
"""
name, address, country, coordinates, job, company, current_location, birthday,
catch_phrase, word, words, (characteristic_features), latitude, longitude

1165 total criminals:

2019/10/30
Purpose: to generate a table with fake data
"""
import os

from datetime import datetime
# import random
import numpy as np
import pandas as pd
from faker import Faker

from faker.generator import random

# Fix the Faker seed up front so repeated runs are meant to be reproducible.
# NOTE(review): `random` above is imported from faker.generator, presumably so
# that this seed also governs the random.* calls in this file -- confirm.
Faker.seed(4321)
# Default-locale Faker instance; the functions visible in this file build
# their own locale-specific instances instead of using this one.
faker = Faker()
# Not referenced by any code visible in this file.
data_folder = ".data"



# Per-country generation settings, keyed by country name:
#   number_of_agents - how many fake criminals to generate (drawn once, 250-350)
#   city             - city whose surroundings the criminals are placed in
#   city_coordinates - (latitude, longitude) of that city, stored as strings
#   faker_abbrev     - Faker locale used for names and addresses
# The randint calls are kept in the same order so a seeded run draws the same
# agent counts as before.
main_dict = {
    'France': {
        'number_of_agents': random.randint(250, 350),
        'city': 'Paris',
        'city_coordinates': ('48.8566', '2.3522'),
        'faker_abbrev': 'fr_FR',
    },
    'United Kingdom': {
        'number_of_agents': random.randint(250, 350),
        'city': 'London',
        'city_coordinates': ('51.5074', '-0.1278'),
        'faker_abbrev': 'en_GB',
    },
    'Germany': {
        'number_of_agents': random.randint(250, 350),
        # The coordinates below are Frankfurt's; the city used to be
        # mislabelled as 'London' (copy-paste from the UK entry).
        'city': 'Frankfurt',
        'city_coordinates': ('50.1109', '8.6821'),
        'faker_abbrev': 'de_DE',
    },
    'Netherlands': {
        'number_of_agents': random.randint(250, 350),
        'city': 'Amsterdam',
        'city_coordinates': ('52.3667', '4.8945'),
        'faker_abbrev': 'nl_NL',
    },
}



def parse_main_dict():
    """Collect per-country lookups from the module-level ``main_dict``.

    Returns:
        A tuple ``(fakers, countries, cities, total_agents)``:
        ``fakers`` is a list with one ``Faker`` per country, built from the
        entry's ``'faker_abbrev'``; ``countries`` is the key view of
        ``main_dict``; ``cities`` is the list of ``'city'`` values; and
        ``total_agents`` is the sum of all ``'number_of_agents'`` values.
        The lists follow the iteration order of ``main_dict``.
    """
    entries = tuple(main_dict.values())

    cities = [entry['city'] for entry in entries]

    fakers = []
    for entry in entries:
        fakers.append(Faker(entry['faker_abbrev']))

    total_agents = 0
    for entry in entries:
        total_agents += entry['number_of_agents']

    return fakers, main_dict.keys(), cities, total_agents


def generate_lat_lon(country, main_dict):
    """Draw random coordinates around the city configured for ``country``.

    For each of the country's ``'number_of_agents'`` a latitude and a
    longitude are drawn uniformly within +/- 0.25 of the city's coordinates
    (read from ``main_dict[country]['city_coordinates']``) and rounded to
    4 decimal places. The agent count is printed as a side effect.

    Args:
        country: Key into ``main_dict``.
        main_dict: Mapping of country name to its settings dict.

    Returns:
        A pair ``(lats, lons)`` of equally long lists of strings.
    """
    spread = 0.25
    precision = 4

    entry = main_dict[country]
    n_agents = entry['number_of_agents']
    print("n_agents: ", n_agents)

    latitudes, longitudes = [], []
    for _ in range(n_agents):
        centre_lat = float(entry['city_coordinates'][0])
        centre_lon = float(entry['city_coordinates'][1])
        # Latitude is drawn before longitude to keep the random stream order.
        drawn_lat = random.uniform(centre_lat - spread, centre_lat + spread)
        drawn_lon = random.uniform(centre_lon - spread, centre_lon + spread)
        latitudes.append(str(round(drawn_lat, precision)))
        longitudes.append(str(round(drawn_lon, precision)))

    return latitudes, longitudes


def generate_aliases(n_agents):
    """Build a shuffled alias list padded with ``None``.

    Reads whitespace-separated nicknames from ``nicknames_str.txt`` in the
    current directory, capitalises them and removes duplicates. If there are
    fewer unique nicknames than ``n_agents``, the list is padded with ``None``
    up to ``n_agents`` entries; if there are more, all of them are kept and
    the result is longer than ``n_agents``. The list is then shuffled.

    Args:
        n_agents: Desired minimum length of the returned list.

    Returns:
        A shuffled list of alias strings and ``None`` placeholders.
    """
    with open('nicknames_str.txt', 'r') as f:
        aliases = [word.capitalize() for word in f.read().split()]

    # A set of strings iterates in a hash-dependent order that changes between
    # interpreter runs; sorting makes the shuffle below reproducible when the
    # random generator is seeded.
    aliases_unique = sorted(set(aliases))
    n_missing = max(n_agents - len(aliases_unique), 0)
    aliases = aliases_unique + [None] * n_missing
    random.shuffle(aliases)

    return aliases


def generate_country_criminals_df(regional_faker, country):
    """Build the criminals dataframe for a single country.

    The number of rows and the city come from the module-level ``main_dict``
    entry for ``country``. Names and single-line addresses are produced by
    ``regional_faker``; coordinates come from ``generate_lat_lon``.

    Args:
        regional_faker: Object providing ``name()`` and ``address()``.
        country: Key into ``main_dict``.

    Returns:
        A dataframe with columns ``name, id, address, lat, lon, country,
        city``, where ``id`` is the row's position within this country.
    """
    country_entry = main_dict[country]
    n_agents = country_entry['number_of_agents']

    # All names are generated before any address, then the coordinates, so the
    # sequence of random draws stays the same.
    names = []
    for _ in range(n_agents):
        names.append(regional_faker.name())

    addresses = []
    for _ in range(n_agents):
        addresses.append(regional_faker.address().replace('\n', ' '))

    latitudes, longitudes = generate_lat_lon(country, main_dict)

    df = pd.DataFrame({
        'name': names,
        'address': addresses,
        'lat': latitudes,
        'lon': longitudes,
    })

    # Constant columns for the country and city, plus a per-country id.
    df['country'] = country
    df['city'] = country_entry['city']
    df['id'] = df.index

    ordered_columns = ['name', 'id', 'address', 'lat', 'lon', 'country', 'city']
    return df[ordered_columns]


def add_date_column(df):
    """Add a ``date`` column of random dates from the last 360 days.

    One date is drawn per row with an ``en_GB`` Faker, between 360 days ago
    and today. The dataframe is modified in place and also returned.

    Args:
        df: Dataframe to extend; its existing columns are not read.

    Returns:
        The same dataframe with a datetime64 ``date`` column.
    """
    faker_en = Faker('en_GB')

    dates = [
        faker_en.date_between(start_date='-360d', end_date='today')
        for _ in range(len(df))
    ]
    # pd.to_datetime replaces astype('datetime64'): pandas >= 2.0 rejects the
    # unit-less dtype string with a TypeError.
    df['date'] = pd.to_datetime(dates)

    return df


def add_date_not_sunday(value):
    """Return a random date from the last 360 days that is not a Sunday.

    Dates are drawn with an ``en_GB`` Faker until one falls on another day of
    the week. The chosen date is printed as a side effect.

    Args:
        value: Ignored; kept so existing callers keep working.

    Returns:
        The date object produced by ``Faker.date_between``.
    """
    sunday = 6  # date.weekday() numbers Monday as 0 and Sunday as 6

    faker_en = Faker('en_GB')
    fake_date = faker_en.date_between(start_date='-360d', end_date='today')

    # Compare the weekday number rather than strftime("%A") == 'Sunday': the
    # day name depends on the active locale, the number does not.
    while fake_date.weekday() == sunday:
        fake_date = faker_en.date_between(start_date='-360d', end_date='today')
    print("fake_date: {}".format(fake_date))
    return fake_date


def add_moriarty_profile(df):
    """Turn the top UK weapons seller into the hidden "Moriarty" record.

    The United Kingdom row with crime type ``"weapons sale"`` and the highest
    profit is selected by name. Every row with that name gets a new date that
    is not a Sunday and its alias set to ``None``. Those rows are moved to the
    end of the returned dataframe; their index labels are kept. Before/after
    snapshots are printed.

    Args:
        df: Dataframe with at least ``name``, ``country``, ``crime_type``,
            ``profit``, ``date`` and ``alias`` columns.

    Returns:
        A new dataframe with the Moriarty row(s) updated.

    Raises:
        ValueError: If there is no United Kingdom "weapons sale" row.
    """
    candidates = df.loc[
        (df.country == "United Kingdom") & (df.crime_type == "weapons sale")
    ].sort_values('profit', ascending=False)
    if candidates.empty:
        raise ValueError(
            'no "weapons sale" record found for "United Kingdom"'
        )
    hidden_moriarty_name = candidates['name'].iloc[0]

    # Work on an explicit copy so the assignments below do not touch `df`.
    df_moriarty = df.loc[df.name == hidden_moriarty_name].copy()

    print("Df before moriarty creations shape: {}".format(df.shape[0]))
    print(df.loc[df.name == hidden_moriarty_name])

    df_not_moriarty = df.loc[df.name != hidden_moriarty_name]

    df_moriarty["date"] = add_date_not_sunday('test')
    # pd.to_datetime replaces astype('datetime64'): pandas >= 2.0 rejects the
    # unit-less dtype string with a TypeError.
    df_moriarty["date"] = pd.to_datetime(df_moriarty["date"])
    df_moriarty['alias'] = None

    df = pd.concat([df_not_moriarty, df_moriarty])
    print("Df after moriarty creations shape: {}".format(df.shape[0]))
    print(df.loc[df.name == hidden_moriarty_name])

    return df


def generate_crime_types(df):
    """Add ``crime_type`` and ``profit`` columns to ``df``.

    Each crime type is given a fixed fraction of the rows; the last type in
    the table takes whatever rows remain so the total always matches the
    number of rows. The assignments are shuffled. Profit is a random base
    amount (100 to 4990 in steps of 10) multiplied by the crime type's factor
    and rounded up to a whole number.

    The dataframe is modified in place and also returned. Some progress
    information is printed.
    """
    crimes_dict = {
        'weapons sale': {'factor': 100.0, 'fraction': 0.05},
        'drug sale': {'factor': 9.0, 'fraction': 0.08},
        'robbery': {'factor': 0.2, 'fraction': 0.17},
        'forgery': {'factor': 0.12, 'fraction': 0.10},
        'theft': {'factor': 0.08, 'fraction': 0.4},
        'pickpocketing': {'factor': 0.01, 'fraction': 0.2},
    }

    n_rows = df.shape[0]
    *leading_crimes, remainder_crime = crimes_dict

    full_crimes_list = []
    for crime in leading_crimes:
        share = int(n_rows * crimes_dict[crime]['fraction'])
        full_crimes_list.extend([crime] * share)
    # The final crime type fills the rows left over after truncation.
    full_crimes_list.extend([remainder_crime] * (n_rows - len(full_crimes_list)))

    print("df shape0: ", df.shape[0])

    random.shuffle(full_crimes_list)
    print(len(full_crimes_list))
    print(full_crimes_list[0:10])
    df.loc[:, 'crime_type'] = full_crimes_list

    def generate_criminal_profits(crime_type):
        """Return a random profit scaled by the crime type's factor."""
        factor = crimes_dict[crime_type]['factor']
        base_amount = random.randrange(100, 5000, 10)
        # Floor-dividing the negated product and negating again rounds up.
        negated_floor = factor * (-base_amount) // 1
        return int(-negated_floor)

    df['profit'] = df['crime_type'].apply(generate_criminal_profits)

    return df


def split_main_df_into_countries(df):
    """Write two files per country found in ``df`` to the current folder.

    For every distinct value of the ``country`` column:

    * ``criminals_<country>.csv`` holds the id, name, alias, lat and lon
      columns under country-specific headers (one set for Germany and the
      Netherlands, one for France, English headers for any other country).
    * ``crime_type_profit_<country>.txt`` holds name, crime_type and profit,
      separated by single spaces.

    A "Saved: ..." line is printed for each file. Nothing is returned and the
    input dataframe is not modified.
    """
    german_headers = ["id", "benennen", "aliasnamen", "breitengrad", "länge"]
    french_headers = ["id", "nom", "pseudonyme", "latitude", "longitude"]
    english_headers = ["id", "name", "alias", "latitude", "longitude"]
    headers_by_country = {
        "Germany": german_headers,
        "Netherlands": german_headers,
        "France": french_headers,
    }

    df_all_countries = df.copy()

    for country_ in df_all_countries["country"].unique().tolist():
        df_country = df_all_countries.loc[df_all_countries.country == country_]

        # Profile table with country-specific column names.
        profiles = df_country[["id", "name", "alias", "lat", "lon"]].copy()
        profiles.columns = headers_by_country.get(country_, english_headers)
        file_name = "criminals_{}.csv".format(country_)
        print("Saved: {}".format(file_name))
        profiles.to_csv(file_name, header=True, index=False)

        # Crime type and profit table, space separated.
        crimes = df_country[["name", "crime_type", "profit"]].copy()
        file_name = "crime_type_profit_{}.txt".format(country_)
        print("Saved: {}".format(file_name))
        crimes.to_csv(file_name, sep=' ', header=True, index=False)
        

def save_id_date_df(df):
    """Save the ``id``, ``date`` and ``country`` columns to ``id_dates.csv``.

    The file is written to the current folder with a header row and without
    the index, and a "Saved: ..." line is printed. Nothing is returned.
    """
    file_name = "id_dates.csv"
    kept_columns = ["id", "date", "country"]

    df[kept_columns].to_csv(file_name, header=True, index=False)
    print("Saved: {}".format(file_name))

    
def add_weekday_column(df):
    """Add a ``weekday`` column holding the day name of each ``date`` value.

    Each value in ``date`` must support ``strftime``; the name is produced
    with the ``"%A"`` format. The dataframe is modified in place and also
    returned.
    """
    df["weekday"] = df["date"].apply(lambda moment: moment.strftime("%A"))

    return df
    
    
def identify_top_country(df):
    """Return the country with the highest total "weapons sale" profit.

    Only rows whose ``crime_type`` equals ``"weapons sale"`` are considered;
    their ``profit`` is summed per ``country`` and the country with the
    largest sum is returned.
    """
    weapons_rows = df.loc[df["crime_type"] == "weapons sale"]
    profit_by_country = weapons_rows.groupby(["country"]).agg({"profit": "sum"})
    df_ws_top = profit_by_country.sort_values('profit', ascending=False)
    df_ws_top = df_ws_top.reset_index()

    return df_ws_top.country.tolist()[0]


def identify_moriarty(df, country_with_top_ws):
    """Return the name of the top non-Sunday weapons seller in a country.

    Rows are restricted to ``country_with_top_ws`` and to weekdays other than
    ``'Sunday'``. When the dataframe has a ``crime_type`` column, only
    ``"weapons sale"`` rows are considered, matching how the Moriarty record
    is planted in ``add_moriarty_profile``. The name on the row with the
    highest profit is returned.

    Args:
        df: Dataframe with ``name``, ``country``, ``weekday`` and ``profit``
            columns, and optionally ``crime_type``.
        country_with_top_ws: Country to search in.

    Returns:
        The ``name`` value of the highest-profit matching row.

    Raises:
        ValueError: If no row matches the filters.
    """
    selection = (df.country == country_with_top_ws) & (df.weekday != 'Sunday')
    # The filter is skipped for frames without the column so that existing
    # callers passing such frames keep working.
    if 'crime_type' in df.columns:
        selection = selection & (df.crime_type == "weapons sale")

    candidates = df.loc[selection].sort_values('profit', ascending=False)
    if candidates.empty:
        raise ValueError(
            "no matching record found for country {!r}".format(country_with_top_ws)
        )

    moriarty_name = candidates['name'].iloc[0]

    return moriarty_name


def get_correct_answer(df, save=True):
    """Work out the puzzle's answer and update the puzzle files.

    Steps:
    1. Find the country with the highest weapons-sale profit.
    2. Read ``puzzle.txt``, apply ``str.format`` with that country and write
       the result back to the same file.
       NOTE(review): presumably the file contains a ``{}`` placeholder; once
       it has been filled in, later runs leave the text unchanged -- confirm
       this is intended.
    3. Add a ``weekday`` column to ``df`` (the caller's dataframe is modified
       in place) and identify Moriarty's name in the top country.
    4. If ``save`` is true, write the name to ``moriarty_name.txt``.

    Args:
        df: Dataframe of all criminals.
        save: Whether to write the name to ``moriarty_name.txt``.

    Returns:
        Moriarty's name.
    """
    top_country = identify_top_country(df)

    # Format before reopening for writing so a formatting error cannot leave
    # puzzle.txt truncated.
    with open('puzzle.txt') as f:
        template = f.read()
    filled_text = template.format(top_country)
    with open('puzzle.txt', 'w') as f:
        f.write(filled_text)

    # add_weekday_column returns the very dataframe it was given.
    df_with_weekday = add_weekday_column(df)
    moriarty_name = identify_moriarty(df, top_country)

    if not save:
        return moriarty_name

    with open('moriarty_name.txt', 'w') as f:
        f.write(moriarty_name)

    return moriarty_name
    
    

def generate_df_with_criminals():
    """Create the full criminals dataframe and write the derived files.

    Steps:
    1. Build one dataframe per country and concatenate them.
    2. Attach shuffled aliases and drop rows with duplicate names.
    3. Add the date, crime type and profit columns.
    4. Plant the Moriarty profile.
    5. Save the per-country files and the id/date file, then compute and
       store the puzzle answer.

    Progress information is printed along the way.

    Returns:
        The combined dataframe.
    """
    fakers, countries, _cities, n_agents = parse_main_dict()

    country_criminals_dfs = [
        generate_country_criminals_df(regional_faker, country)
        for regional_faker, country in zip(fakers, countries)
    ]

    df = pd.concat(country_criminals_dfs, ignore_index=True, sort=False)
    # The alias Series is aligned on the fresh 0..n-1 index of the concat.
    df['alias'] = pd.Series(generate_aliases(n_agents))

    print("Initial shape: {}".format(df.shape[0]))
    df.drop_duplicates(subset=['name'], inplace=True)
    print("Initial shape2: {}".format(df.shape[0]))

    df = add_date_column(df)
    print("Final shape: {}".format(df.shape[0]))
    print(df.columns.tolist())
    df = generate_crime_types(df)

    final_columns = [
        'name', 'alias', 'id', 'address', 'lat', 'lon', 'country',
        'city', 'date', 'crime_type', 'profit',
    ]
    df = df[final_columns]

    df = add_moriarty_profile(df.copy())
    split_main_df_into_countries(df)  # writes the per-country files

    save_id_date_df(df)  # id and date, kept for a later join

    get_correct_answer(df)

    return df


# Build the data set; this call also writes the per-country and puzzle files.
main_criminals_df = generate_df_with_criminals()
print("main_criminals_df count: {}".format(len(main_criminals_df.index)))
# Preview of the first rows by name; the result is not stored.
main_criminals_df.sort_values("name").head(5)

# Save the complete table.
main_criminals_df.to_csv('criminals.csv', index=False, header=True)


n_agents:  282
n_agents:  257
n_agents:  302
n_agents:  258
Initial shape: 1099
Initial shape2: 1099
Final shape: 1099
['name', 'id', 'address', 'lat', 'lon', 'country', 'city', 'alias', 'date']
df shape0:  1099
1099
['robbery', 'theft', 'pickpocketing', 'theft', 'pickpocketing', 'theft', 'robbery', 'theft', 'pickpocketing', 'pickpocketing']
Df before moriarty creations shape: 1099
             name alias  id                            address      lat  \
378  Tracey Smith  None  96  6 Garry village Batestown E21 3ZH  51.5867   

        lon         country    city       date    crime_type  profit  
378  0.0663  United Kingdom  London 2020-08-28  weapons sale  479000  
fake_date: 2019-12-27
Df after moriarty creations shape: 1099
             name alias  id                            address      lat  \
378  Tracey Smith  None  96  6 Garry village Batestown E21 3ZH  51.5867   

        lon         country    city       date    crime_type  profit  
378  0.0663  United Kingdom  London 20