In [1]:
import pandas as pd
import logging
import re

In [2]:
# Load the raw incident export.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
csv_path = '/Users/aaron/Desktop/convertcsv.csv'
df = pd.read_csv(csv_path)

In [4]:
# Correct the misspelled 'Occured' column header
df = df.rename(columns={"Occured": "Occurred"})

In [5]:
# Discard the two columns this analysis does not use, then every row
# in which all remaining values are missing
df = df.drop(columns=['Disposition', 'UCPD_ID'])
df = df.dropna(axis=0, how='all')

In [6]:
# Lower-case and trim every string cell; non-string cells pass through unchanged
df = df.applymap(lambda cell: cell.lower().strip() if isinstance(cell, str) else cell)

In [7]:
# Remove bad data: drop every row whose Incident text contains 'void',
# a colon, or 'no incident reports'
remove = 'void|:|no incident reports'
# na=False treats a missing Incident as "no match", so the mask is purely
# boolean and `~` cannot fail on NaN entries; those rows are kept.
df = df[~df.Incident.str.contains(remove, na=False)]

In [8]:
# Tag common incidents using a regex
common_incidents = 'lost|assault|theft|robbery|battery|mental health|burglary'
regex = re.compile(common_incidents)

def tag_incident(incident):
    """
    Return the first common-incident keyword found in an incident description.

    Parameters
    ----------
    incident : str
        Incident description. Non-string values (e.g. NaN for a missing
        description) are accepted and yield NaN.

    Returns
    -------
    str or float
        The earliest match of `regex` in `incident`, or NaN when no keyword
        occurs or `incident` is not a string.
    """
    try:
        return regex.findall(incident)[0]
    except (TypeError, IndexError):
        # TypeError: `incident` is not a string; IndexError: no keyword found.
        # A plain float NaN is used because the `pd.np` alias no longer
        # exists in pandas >= 2.0.
        return float('nan')

# Label every row with its incident tag (NaN where no keyword matched)
df['Tag'] = df['Incident'].map(tag_incident)

In [9]:
# Number of incidents carrying each tag
tag_counts = df['Tag'].value_counts()
print(tag_counts)

theft            2825
lost              672
robbery           564
burglary          423
battery           368
assault           130
mental health     117
Name: Tag, dtype: int64


## Approximate time of incident

In [11]:
# Split each Occurred string on the substring 'to', giving a Series that
# holds one list of pieces per row.
# NOTE(review): 'to' is matched anywhere in the text, not only as a separate
# word — fine for numeric dates, but confirm no value contains e.g. "october".
date = df.Occurred.str.split(pat=r'to')

# Fix specific values
# The entries at index labels 39 and 730 are overwritten by hand with
# pre-split pieces; presumably the raw strings there do not split cleanly —
# TODO confirm against the source data.
date.loc[39] = ['7/11/2010 12:07 am']
date.loc[730] = ['11/23/10 10:30 am',  '2:30 pm']

In [22]:
# Configure logging settings: warnings and above from this notebook's logger
# are written to 'failed_guess_date.log'
logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)

# getLogger() hands back the same logger object on every call, so re-running
# this cell would stack another FileHandler on it and each warning would be
# written once per run. Attach the handler only if none is present yet.
if not any(isinstance(handler, logging.FileHandler) for handler in logger.handlers):
    file_handler = logging.FileHandler('failed_guess_date.log')
    formatter = logging.Formatter('%(levelname)s:%(name)s:%(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)



In [23]:
def _midpoint_if_short(start, end, max_span):
    """Midpoint of start..end when 0 <= end - start < max_span, otherwise NaT."""
    span = end - start
    # Comparisons against NaT are False, so an unparsed endpoint also gives NaT.
    if pd.Timedelta(0) <= span < max_span:
        return start + span / 2
    return pd.NaT


def guess_date(date, max_span=pd.Timedelta(hours=6)):
    """
    Approximate one incident timestamp from the pieces of a single
    'Occurred' value (one element of the split Series, not the Series itself).

    Parameters
    ----------
    date : list of str
        * one piece   -- a single date/time, parsed as is (NaT if unparseable);
        * two pieces  -- a start date/time and an end time taken on the start's
          calendar date; an end earlier than the start is read as falling on
          the following day;
        * three or more pieces -- the first piece is the start and only the
          calendar date found in the second piece is used as the end.
        A float (NaN for a missing value) is accepted and yields NaT.
    max_span : pandas.Timedelta, default 6 hours
        A midpoint is returned only when the start-to-end span is
        non-negative and shorter than this.

    Returns
    -------
    pandas.Timestamp or NaT
        The parsed time (one piece), the midpoint of a short enough range, or
        NaT when the range is too long, reversed, or cannot be parsed.
        Failures are logged as warnings on this module's logger.
    """
    # Same logger object as `logging.getLogger(__name__)` elsewhere in this
    # notebook; looked up here so the function does not depend on cell order.
    log = logging.getLogger(__name__)

    if isinstance(date, float):
        log.warning(f'Failed to parse: {date}')
        return pd.NaT

    try:
        if len(date) == 1:
            return pd.to_datetime(date[0], errors='coerce')

        start = pd.to_datetime(date[0])

        if len(date) == 2:
            day = re.findall(r"\d+/\d+/\d+", date[0])[0]
            # Join with an explicit space: the end time may arrive without
            # leading whitespace (e.g. a hand-entered '2:30 pm'), and gluing
            # it straight onto the date would corrupt the year.
            end = pd.to_datetime(f'{day} {date[1].strip()}')
            if end < start:
                # e.g. 11:00 pm -> 1:00 am: the range crosses midnight
                end += pd.Timedelta(days=1)
        else:
            # NOTE(review): only the calendar date of the second piece is used;
            # the times in the second and third pieces are ignored — confirm
            # that this is the intended reading of "d1 to d2 t1 to t2".
            end = pd.to_datetime(re.findall(r"\d+/\d+/\d+", date[1])[0])

        return _midpoint_if_short(start, end, max_span)

    except (ValueError, IndexError, TypeError) as err:
        # ValueError covers unparseable text and pandas' OutOfBoundsDatetime
        # (a ValueError subclass); IndexError: no m/d/y date found or an empty
        # list; TypeError: `date` is not a sequence of strings.
        log.warning(f'{err}: {date}')
        return pd.NaT

0                                [6/28/10 2:45 pm]
1           [6/29/10 ,  7/1/10 3:00 pm ,  1:50 pm]
2          [6/29/10 ,  7/1/10 5:30 pm ,  12:30 pm]
3          [6/23/10 ,  6/29/10 9:00 am ,  5:00 pm]
4                                 [7/1/10 6:15 pm]
5                                [7/1/10 10:00 pm]
6                                [7/1/10 11:50 pm]
7                                [7/1/10 10:45 pm]
8                                 [7/2/10 1:45 am]
9                                [7/2/10 10:20 pm]
10                                [7/3/10 2:05 am]
11                                [7/3/10 1:20 pm]
12                     [7/3/10 4:00 pm ,  5:00 pm]
13                               [7/4/10 12:00 am]
14                                [7/4/10 2:00 am]
15                                [7/4/10 7:20 am]
16                    [7/5/10 7:00 am ,  11:45 am]
17                                [7/5/10 7:08 pm]
20           [7/3/10 ,  7/6/10 2:00 pm ,  7:00 am]
21                             

In [1]:
import pandas as pd

In [18]:
# Small two-column frame for experimenting with column assignment
a = pd.DataFrame({'hi': [3, 1], 'world': [4, 1]})
a

Unnamed: 0,hi,world
0,3,4
1,1,1


In [19]:
# Iterating a DataFrame yields its column labels, so unpacking `a` gives the
# two labels themselves; each new column is filled with one label string.
first_label, second_label = a
a['fdf'] = first_label
a['fd'] = second_label

In [20]:
a

Unnamed: 0,hi,world,fdf,fd
0,3,4,hi,world
1,1,1,hi,world


In [22]:
!pip tqdm

ERROR: unknown command "tqdm"


In [33]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Random 100000 x 6 frame of integers in [0, 100) to try the progress bar on.
# NOTE(review): this rebinds `df`, replacing the incident data loaded earlier
# in the notebook.
df = pd.DataFrame(np.random.randint(low=0, high=100, size=(100000, 6)))

# Hook tqdm into pandas so that `progress_apply` / `progress_map` become available
# (`tqdm_gui`, `tqdm_notebook`, optional kwargs, etc. can be used as well)
tqdm.pandas(desc="my bar!")

# `progress_apply` stands in for `apply` (and `progress_map` for `map`)
df.progress_apply(lambda column: column ** 2)

ModuleNotFoundError: No module named 'tqdm'

In [1]:
import tqdm

ModuleNotFoundError: No module named 'tqdm'

In [29]:
!pip install tqdm



In [35]:
!python -V

Python 3.6.5
