## Importing Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup

## Importing dataset

In [2]:
dataset = pd.read_csv('crimes_against_women_2001-2014.csv')
dataset.head(5)

Unnamed: 0.1,Unnamed: 0,STATE/UT,DISTRICT,Year,Rape,Kidnapping and Abduction,Dowry Deaths,Assault on women with intent to outrage her modesty,Insult to modesty of Women,Cruelty by Husband or his Relatives,Importation of Girls
0,0,ANDHRA PRADESH,ADILABAD,2001,50,30,16,149,34,175,0
1,1,ANDHRA PRADESH,ANANTAPUR,2001,23,30,7,118,24,154,0
2,2,ANDHRA PRADESH,CHITTOOR,2001,27,34,14,112,83,186,0
3,3,ANDHRA PRADESH,CUDDAPAH,2001,20,20,17,126,38,57,0
4,4,ANDHRA PRADESH,EAST GODAVARI,2001,23,26,12,109,58,247,0


In [3]:
dataset.shape

(10677, 11)

# Data Cleaning

In [4]:
# Checking for missing values.
dataset.isnull().sum()

# There's no missing value.

Unnamed: 0                                             0
STATE/UT                                               0
DISTRICT                                               0
Year                                                   0
Rape                                                   0
Kidnapping and Abduction                               0
Dowry Deaths                                           0
Assault on women with intent to outrage her modesty    0
Insult to modesty of Women                             0
Cruelty by Husband or his Relatives                    0
Importation of Girls                                   0
dtype: int64

In [5]:
# Removing the unnamed column. 
dataset.drop('Unnamed: 0', axis=1, inplace= True)

In [6]:
dataset.head()

Unnamed: 0,STATE/UT,DISTRICT,Year,Rape,Kidnapping and Abduction,Dowry Deaths,Assault on women with intent to outrage her modesty,Insult to modesty of Women,Cruelty by Husband or his Relatives,Importation of Girls
0,ANDHRA PRADESH,ADILABAD,2001,50,30,16,149,34,175,0
1,ANDHRA PRADESH,ANANTAPUR,2001,23,30,7,118,24,154,0
2,ANDHRA PRADESH,CHITTOOR,2001,27,34,14,112,83,186,0
3,ANDHRA PRADESH,CUDDAPAH,2001,20,20,17,126,38,57,0
4,ANDHRA PRADESH,EAST GODAVARI,2001,23,26,12,109,58,247,0


### Data type check

In [7]:
dataset.info()

# all features seem to be in their correct data type.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10677 entries, 0 to 10676
Data columns (total 10 columns):
 #   Column                                               Non-Null Count  Dtype 
---  ------                                               --------------  ----- 
 0   STATE/UT                                             10677 non-null  object
 1   DISTRICT                                             10677 non-null  object
 2   Year                                                 10677 non-null  int64 
 3   Rape                                                 10677 non-null  int64 
 4   Kidnapping and Abduction                             10677 non-null  int64 
 5   Dowry Deaths                                         10677 non-null  int64 
 6   Assault on women with intent to outrage her modesty  10677 non-null  int64 
 7   Insult to modesty of Women                           10677 non-null  int64 
 8   Cruelty by Husband or his Relatives                  10677 non-null  int64 


### Consistency Check

In [8]:
def uniquevals(col):
    return dataset[col].unique()

In [9]:
uniq_states = uniquevals('STATE/UT')
uniq_states
# data is inconsistent some states are in upper case and others in lower case.

array(['ANDHRA PRADESH', 'ARUNACHAL PRADESH', 'ASSAM', 'BIHAR',
       'CHHATTISGARH', 'GOA', 'GUJARAT', 'HARYANA', 'HIMACHAL PRADESH',
       'JAMMU & KASHMIR', 'JHARKHAND', 'KARNATAKA', 'KERALA',
       'MADHYA PRADESH', 'MAHARASHTRA', 'MANIPUR', 'MEGHALAYA', 'MIZORAM',
       'NAGALAND', 'ODISHA', 'PUNJAB', 'RAJASTHAN', 'SIKKIM',
       'TAMIL NADU', 'TRIPURA', 'UTTAR PRADESH', 'UTTARAKHAND',
       'WEST BENGAL', 'A & N ISLANDS', 'CHANDIGARH', 'D & N HAVELI',
       'DAMAN & DIU', 'DELHI', 'LAKSHADWEEP', 'PUDUCHERRY',
       'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar',
       'Chhattisgarh', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh',
       'Jammu & Kashmir', 'Jharkhand', 'Karnataka', 'Kerala',
       'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram',
       'Nagaland', 'Odisha', 'Punjab', 'Rajasthan', 'Sikkim',
       'Tamil Nadu', 'Tripura', 'Uttar Pradesh', 'Uttarakhand',
       'West Bengal', 'A&N Islands', 'Chandigarh', 'D&N Haveli',
       '

From the internet:

The basic difference between a State and a Union Territory is that union territories are directly ruled by the union government, while states have a separate government for administrative purposes. A nation is known by its states and union territories. __India as a country comprises *28* states and *8* Union Territories__.

In [10]:
# Correcting it by taking everything to uppercase.
dataset['STATE/UT']= dataset['STATE/UT'].str.upper()

In [11]:
print(sorted(uniquevals('STATE/UT')))
len(uniquevals('STATE/UT'))

['A & N ISLANDS', 'A&N ISLANDS', 'ANDHRA PRADESH', 'ARUNACHAL PRADESH', 'ASSAM', 'BIHAR', 'CHANDIGARH', 'CHHATTISGARH', 'D & N HAVELI', 'D&N HAVELI', 'DAMAN & DIU', 'DELHI', 'DELHI UT', 'GOA', 'GUJARAT', 'HARYANA', 'HIMACHAL PRADESH', 'JAMMU & KASHMIR', 'JHARKHAND', 'KARNATAKA', 'KERALA', 'LAKSHADWEEP', 'MADHYA PRADESH', 'MAHARASHTRA', 'MANIPUR', 'MEGHALAYA', 'MIZORAM', 'NAGALAND', 'ODISHA', 'PUDUCHERRY', 'PUNJAB', 'RAJASTHAN', 'SIKKIM', 'TAMIL NADU', 'TELANGANA', 'TRIPURA', 'UTTAR PRADESH', 'UTTARAKHAND', 'WEST BENGAL']


39

It's possible not all States and UTs are here, but they shouldn't exceed 36, looking closely by sorting we still see some inconsistencies.
1. 'A & N ISLANDS' and 'A&N ISLANDS'
2. 'D & N HAVELI' and 'D&N HAVELI'
3. 'DELHI' and 'DELHI UT'



In [12]:
# correcting
dataset['STATE/UT'].replace('A&N ISLANDS', 'A & N ISLANDS', inplace= True)
dataset['STATE/UT'].replace('D&N HAVELI', 'D & N HAVELI', inplace= True)
dataset['STATE/UT'].replace('DELHI UT', 'DELHI', inplace= True)

In [13]:
len(uniquevals('STATE/UT'))

36

In [14]:
sorted(uniquevals('STATE/UT'))
# the column values are now consistent.

['A & N ISLANDS',
 'ANDHRA PRADESH',
 'ARUNACHAL PRADESH',
 'ASSAM',
 'BIHAR',
 'CHANDIGARH',
 'CHHATTISGARH',
 'D & N HAVELI',
 'DAMAN & DIU',
 'DELHI',
 'GOA',
 'GUJARAT',
 'HARYANA',
 'HIMACHAL PRADESH',
 'JAMMU & KASHMIR',
 'JHARKHAND',
 'KARNATAKA',
 'KERALA',
 'LAKSHADWEEP',
 'MADHYA PRADESH',
 'MAHARASHTRA',
 'MANIPUR',
 'MEGHALAYA',
 'MIZORAM',
 'NAGALAND',
 'ODISHA',
 'PUDUCHERRY',
 'PUNJAB',
 'RAJASTHAN',
 'SIKKIM',
 'TAMIL NADU',
 'TELANGANA',
 'TRIPURA',
 'UTTAR PRADESH',
 'UTTARAKHAND',
 'WEST BENGAL']

#### Cross Validation.

In [15]:
# Using Beautiful Soup to get data off wikipedia.
'''url = 'https://en.wikipedia.org/wiki/List_of_districts_in_India#:~:text=As%20of%202021%20there%20are,the%202001%20Census%20of%20India.'
state_ut = []

#This part was to access the webpage.

access = requests.get(url)
print(access.status_code)

# This part made me get the source code
source_code = access.content
#print(source_code)

soup = BeautifulSoup(source_code, "html.parser")
# Now that we have put in the source code into the beautiful soup class. We can now get all the links in it.
links = soup.find_all("a")

for link in links:
    # we use the .text or .string function to print out just the text in between each a tag looping through each link
    # with the for loop
    state_ut.append(link.string)
    
state_ut[state_ut.index('Andhra Pradesh'):129]'''

# just so we don't need the internet to get this, i copied it from the list scraped from the internet.
state_utt = ['Andhra Pradesh',
'Arunachal Pradesh',
 'Assam',
 'Bihar',
 'Chhattisgarh',
 'Goa',
 'Gujarat',
 'Haryana',
 'Himachal Pradesh',
 'Jharkhand',
 'Karnataka',
 'Kerala',
 'Madhya Pradesh',
 'Maharashtra',
 'Manipur',
 'Meghalaya',
 'Mizoram',
 'Nagaland',
 'Odisha',
 'Punjab',
 'Rajasthan',
 'Sikkim',
 'Tamil Nadu',
 'Telangana',
 'Tripura',
 'Uttar Pradesh',
 'Uttarakhand',
 'West Bengal',
 'A & N ISLANDS',
 'Chandigarh',
 'Dadra and Nagar Haveli and Daman and Diu',
 'JAMMU & KASHMIR',
 'Ladakh',
 'Lakshadweep', 
 'Delhi',
 'Puducherry']

len(state_utt)

36

In [16]:
# Changing the gotten data to uppercase
state_utt = [c.upper() for c in state_utt]

In [17]:
sorted(state_utt) == sorted(uniquevals('STATE/UT'))
# The two lists aren't the same

False

In [18]:
# A closer look
print(sorted(state_utt))
print('i contain {}'.format(len(state_utt)),'\n')
print(sorted(uniquevals('STATE/UT')))
print('i contain {}'.format(len(uniquevals('STATE/UT'))))

['A & N ISLANDS', 'ANDHRA PRADESH', 'ARUNACHAL PRADESH', 'ASSAM', 'BIHAR', 'CHANDIGARH', 'CHHATTISGARH', 'DADRA AND NAGAR HAVELI AND DAMAN AND DIU', 'DELHI', 'GOA', 'GUJARAT', 'HARYANA', 'HIMACHAL PRADESH', 'JAMMU & KASHMIR', 'JHARKHAND', 'KARNATAKA', 'KERALA', 'LADAKH', 'LAKSHADWEEP', 'MADHYA PRADESH', 'MAHARASHTRA', 'MANIPUR', 'MEGHALAYA', 'MIZORAM', 'NAGALAND', 'ODISHA', 'PUDUCHERRY', 'PUNJAB', 'RAJASTHAN', 'SIKKIM', 'TAMIL NADU', 'TELANGANA', 'TRIPURA', 'UTTAR PRADESH', 'UTTARAKHAND', 'WEST BENGAL']
i contain 36 

['A & N ISLANDS', 'ANDHRA PRADESH', 'ARUNACHAL PRADESH', 'ASSAM', 'BIHAR', 'CHANDIGARH', 'CHHATTISGARH', 'D & N HAVELI', 'DAMAN & DIU', 'DELHI', 'GOA', 'GUJARAT', 'HARYANA', 'HIMACHAL PRADESH', 'JAMMU & KASHMIR', 'JHARKHAND', 'KARNATAKA', 'KERALA', 'LAKSHADWEEP', 'MADHYA PRADESH', 'MAHARASHTRA', 'MANIPUR', 'MEGHALAYA', 'MIZORAM', 'NAGALAND', 'ODISHA', 'PUDUCHERRY', 'PUNJAB', 'RAJASTHAN', 'SIKKIM', 'TAMIL NADU', 'TELANGANA', 'TRIPURA', 'UTTAR PRADESH', 'UTTARAKHAND', 'WEST

### Dadra and Nagar Haveli and Daman and Diu is a union territory in India. The territory was constituted through the merger of the former territories of Dadra and Nagar Haveli and Daman and Diu. Plans for the proposed merger were announced by the Government of India in July 2019, the necessary legislation was passed in the Parliament of India in December 2019 and came into effect on 26 January 2020. 

The most recent year in our dataset is 2014. So i'm guessing they two were different UTs as at the time this data was collected because in our dataset, we have them as two different UTs and we don't have LADAKH in our dataset. So is LADAKH absent from this dataset ?

In [19]:
# to be sure let's look for districts in LADAKH. From the internet, kargil and leh are in LADAKH but we find them in jammu 
# and kashmir.

# so it could mean that the dataset actually contains the whole of india, we can cross valiate the districts to be sure they are
# actually districts in india and know if the dataset contains the whole of india.

dataset[dataset.DISTRICT == 'LEH']

Unnamed: 0,STATE/UT,DISTRICT,Year,Rape,Kidnapping and Abduction,Dowry Deaths,Assault on women with intent to outrage her modesty,Insult to modesty of Women,Cruelty by Husband or his Relatives,Importation of Girls
219,JAMMU & KASHMIR,LEH,2001,0,1,0,1,0,0,0
936,JAMMU & KASHMIR,LEH,2002,3,1,0,1,1,0,0
1659,JAMMU & KASHMIR,LEH,2003,3,1,0,2,0,0,0
2387,JAMMU & KASHMIR,LEH,2004,3,4,0,3,0,1,0
3121,JAMMU & KASHMIR,LEH,2005,5,3,0,4,0,0,0
3855,JAMMU & KASHMIR,LEH,2006,3,5,0,5,0,0,0
4595,JAMMU & KASHMIR,LEH,2007,2,3,0,4,0,0,0
5340,JAMMU & KASHMIR,LEH,2008,6,3,0,7,0,0,0
6104,JAMMU & KASHMIR,LEH,2009,2,1,0,3,0,0,0
6877,JAMMU & KASHMIR,LEH,2010,2,3,0,3,0,0,0


From the internet:

Kargil, portion of the western Ladakh union territory, northwestern India, **formerly part of northwestern Jammu and Kashmir state.**

So kargil was fromerly part of jammu and kasmir.

In [20]:
# checking the District Column
print(sorted(uniquevals('DISTRICT')))
len(uniquevals('DISTRICT'))

['24 PARGANAS NORTH', '24 PARGANAS SOUTH', 'A and N ISLANDS', 'ADILABAD', 'AGAR', 'AGRA', 'AHMEDABAD COMMR.', 'AHMEDABAD RURAL', 'AHMEDNAGAR', 'AHWA-DANG', 'AIZAWL', 'AJMER', 'AKOLA', 'ALAPUZHA', 'ALIGARH', 'ALIRAJPUR', 'ALLAHABAD', 'ALMORA', 'ALWAR', 'AMBALA', 'AMBALA RURAL', 'AMBALA URBAN', 'AMBEDKAR NAGAR', 'AMETHI', 'AMRAVATI COMMR.', 'AMRAVATI RURAL', 'AMRELI', 'AMRITSAR', 'AMRITSAR RURAL', 'AMROHA', 'ANAND', 'ANANTAPUR', 'ANANTNAG', 'ANDAMAN', 'ANGUL', 'ANJAW', 'ANUPPUR', 'ARARIA', 'ARIYALUR', 'ARWAL', 'ASANSOL', 'ASHOK NAGAR', 'AURAIYA', 'AURANGABAD', 'AURANGABAD COMMR.', 'AURANGABAD RURAL', 'AWANTIPORA', 'AZAMGARH', 'Adilabad', 'Agar', 'Agra', 'Ahmedabad City', 'Ahmedabad Rural', 'Ahmednagar', 'Aizawl', 'Ajmer', 'Akola', 'Alapuzha', 'Aligarh', 'Alipurduar', 'Alirajpur', 'Allahabad', 'Almora', 'Alwar', 'Ambala (Rural)', 'Ambala (Urban)', 'Ambedkar Nagar', 'Amethi', 'Amravati Commr.', 'Amravati Rural', 'Amreli', 'Amritsar Rural', 'Amroha', 'Anand', 'Anantapur', 'Anantnag', 'Angul

1605

In [21]:
# we notice some are in uppercase and others in lower, so we correct that.
# Correcting it and making everything in uppercase.
dataset['DISTRICT']= dataset['DISTRICT'].str.upper()

In [22]:
print(sorted(uniquevals('DISTRICT')))
len(uniquevals('DISTRICT'))

# It appears wikipedia doesn't contain all the districts for example we don't see ANTI TERRORIST SQUAD in wikipedia but it's in
# our dataset but all it's values are 0 is it really a district in india ?

['24 PARGANAS NORTH', '24 PARGANAS SOUTH', 'A AND N ISLANDS', 'ADILABAD', 'AGAR', 'AGRA', 'AHMEDABAD CITY', 'AHMEDABAD COMMR.', 'AHMEDABAD RURAL', 'AHMEDNAGAR', 'AHWA-DANG', 'AIZAWL', 'AJMER', 'AKOLA', 'ALAPUZHA', 'ALIGARH', 'ALIPURDUAR', 'ALIRAJPUR', 'ALLAHABAD', 'ALMORA', 'ALWAR', 'AMBALA', 'AMBALA (RURAL)', 'AMBALA (URBAN)', 'AMBALA RURAL', 'AMBALA URBAN', 'AMBEDKAR NAGAR', 'AMETHI', 'AMRAVATI COMMR.', 'AMRAVATI RURAL', 'AMRELI', 'AMRITSAR', 'AMRITSAR RURAL', 'AMROHA', 'ANAND', 'ANANTAPUR', 'ANANTNAG', 'ANDAMAN', 'ANGUL', 'ANJAW', 'ANTI TERRORIST SQUAD', 'ANUPPUR', 'ARARIA', 'ARIYALUR', 'ARVALLI', 'ARWAL', 'ASANSOL', 'ASANSOL-DURGAPUR PC', 'ASHOK NAGAR', 'AURAIYA', 'AURANGABAD', 'AURANGABAD COMMR.', 'AURANGABAD RURAL', 'AWANTIPORA', 'AZAMGARH', 'BADAUN', 'BADDI', 'BADDIPOLICEDIST', 'BAGAHA', 'BAGALKOT', 'BAGESHWAR', 'BAGHPAT', 'BAHRAICH', 'BAKSA', 'BALAGHAT', 'BALASORE', 'BALLARI', 'BALLIA', 'BALOD', 'BALODA BAZAR', 'BALODBAZAR', 'BALRAMPUR', 'BANASKANTHA', 'BANDA', 'BANDIPORA', 'BA

958

# There's lots of corrections to be made here. But one has to be careful

1. AMBALA (RURAL) and AMBALA RURAL they are the same thing, same with AMBALA (URBAN) and AMBALA URBAN. What i think is that ambala is just one and then in the years 2012, 2013 and 2014 we have them recorded for the rural and urban parts notice that AMBALA alone has just dates from 2001 - 2011. So i could work with the three and if i need to get a total sum for AMBALA, i could always use regex.
2. BALODBAZAR and BALODA BAZAR are the same thing, it should be BALODA BAZAR.
3. BEMETARA and BEMETRA are the same thing, it should be BEMETARA.
4. ~~BHOPAL RLY. and BHOPAL RAILWAY are the same, we could just use BHOPAL RAILWAY. Notice that's there's also bhopal so i'm taking this to mean that all incidents that are recorded in bhopal didn't happen in bhopal railway region and it could be that there's more cases in the railway region of bhopal.~~
5. C.I.D. CRIME and CID CRIME seem to be the same, but have most of the observations containing 0. **LEAVE**
6. C. I. D. and CID seems to be the same. Except for GOA **LEAVE** 
7. CHITRAKOOT and CHITRAKOOT DHAM seem to be the same as CHITRAKOOT has data for just year 2014 and CHITRAKOOT DHAM has for 2001 - 2013.
8. D AND N HAVELI and D&N HAVELI should be the consistent.
9. DAKSHIN KANNADA and DAKSHINA KANNADA should be consistent.
10. DATIYA and DATIA should be DATIA.
11. ~~DHANBAD RAILWAY and DHANBAD RLY. are the same, the former contains only year 2014.~~
12. FEROZPUR and FEROZEPUR are the same and should be FIROZPUR.
13. Notice G. R. P. has different states all for year 2014, and G.R.P has data for punjab for 2001 to 2013, so we can assume that the  G. R. P. punjab should be  G.R.P **LEAVE**
14. Again G. R. P. has different states all for year 2014, and G.R.P. has some states that G. R. P. has and these states all don't have data for 2014 which is contained in G. R. P., so i'm assuming that for these states G. R. P. should be G.R.P., the states are UTTAR PRADESH	and HIMACHAL PRADESH. **LEAVE**
15. G.R.P.AJMER and G.R.P. AJMER are the same one just contains data for year 2010.
16. ~~G. R. P. (RLY) and GRP(RLY) are the same. ~~
17. GARO HILLS SOUTH W. and GARO HILLS SOUTH WEST are the same. 
18. Again, we have G. R. P. for HARNAYA and TRIPURA 2014 data and GRP has data for those two states, except 2014, so for those two states we shpuld change G. R. P. to GRP. **LEAVE**
19. GRP RAIPUR and G. R. P. RAIPUR are the same.
20. ~~GUNTAKAL RLY. and GUNTAKAL RAILWAY are the same.~~
21. HOWRAH G.R.P. and HOWRAH G. R. P. are the same. 
22. IGI AIRPORT and I. G. I. AIRPORT are the same.
23. IMPHAL EAST and IMPHAL(EAST) also IMPHAL WEST and IMPHAL (WEST) are the same. **AT THIS POINT I STOPPED NOTING RLY. and RAILWAY CASES. I JUST CORRECTED ALL CASES WITH THE CODE BELOW.**
24. KHARAGPUR G.R.P. and KHARAGPUR G. R. P. are the same.
25. KHARGON and KHARGONE are the same.
26. 'KHASI HILLS SOUTH W.', 'KHASI HILLS SOUTH WEST' are the same.
27. LAHAUL-SPITI and LAHAUL&SPITI are the same.
28. 'MAHENDERGARH', 'MAHENDRAGARH' are the same, should be mahendragarh.
29. 'MALKANGIR', 'MALKANGIRI are the same, should be 'MALKANGIRI.
30. MUNGALI', 'MUNGELI' are the same, should be 'MUNGELI'.
31. 24 PARGANAS NORTH and NORTH 24 PARGANAS are the same, use NORTH 24 PARGANAS.
32. NORTH-EAST and NORTH EAST are the same, since they are both in delhi, use NORTH EAST.
33. NORTH-WEST and NORTH WEST, since they are both in delhi, use NORTH WEST.
34. 'RAILWAYS KASHMIR' AND 'RAILWAYS KMR'are the same, use 'RAILWAYS KASHMIR'
18. ~~We have some rows giving us the total of other rows, we don't want that.~~
19. Notice that we have RAILWAY AND RAILWAYS they aren't the same, one is in delhi, the other in jammu respectively.
20. We have RAMANAGAR and RAMANAGARA in KARNATAKA, we should have just ramangara.
35. STF should be values for delhis S.T.F. but we have assam in STF, basically STF and S.T.F should be the same.**LEAVE** 
39. SEALDAH G. R. P.', 'SEALDAH G.R.P. should be the same, use SEALDAH G.R.P.
40. 'SILIGURI G. R. P.', 'SILIGURI G.R.P.', should be the same, use SILIGURI G.R.P.
41.  'SAS NAGAR', 'SAS NGR', seems to be the same, use SAS NAGAR
42. 'SILIGURI PC', 'SILIGURI_PC' seem to be the same, use SILIGURI PC
43. SOUTH 24 PARGANAS and 24 PARGANAS SOUTH, use SOUTH 24 PARGANAS
44. SOUTH-WEST and SOUTH WEST are the same, use SOUTH WEST.
45. 'SRP (CUTTACK)', 'SRP(CUTTACK)', are the same use SRP (CUTTACK).
46. 'SRP (ROURKELA)', 'SRP(ROURKELA)', are the same use SRP (ROURKELA)
47. THOOTHUKUDI and THOOTHUGUDI are the same, it should be THOOTHUKUDI.
48. TRICHY RAILWAY and RAILWAY TRICH are the same, use TRICHY RAILWAY.
49. UMARIYA and UMARIA are the same, it should be UMARIA.
50. UTTAR KANNADA and UTTARA KANNADA are the same, it should be UTTARA KANNADA
51. 'VILLUPURAM', 'VILUPPURAM' spelling error, use the correct one, 'VILUPPURAM'

In [23]:
# i used this cell to confirm some of the observations above for example, error 51
dataset[dataset.DISTRICT== 'VILLUPURAM']

Unnamed: 0,STATE/UT,DISTRICT,Year,Rape,Kidnapping and Abduction,Dowry Deaths,Assault on women with intent to outrage her modesty,Insult to modesty of Women,Cruelty by Husband or his Relatives,Importation of Girls
571,TAMIL NADU,VILLUPURAM,2001,37,37,14,78,20,42,0
1287,TAMIL NADU,VILLUPURAM,2002,51,53,3,92,17,42,0
2016,TAMIL NADU,VILLUPURAM,2003,39,40,6,100,29,62,0
2745,TAMIL NADU,VILLUPURAM,2004,55,44,5,87,13,59,0
4218,TAMIL NADU,VILLUPURAM,2006,39,26,6,52,17,38,0
4961,TAMIL NADU,VILLUPURAM,2007,58,53,5,72,22,83,0
5716,TAMIL NADU,VILLUPURAM,2008,58,50,6,70,31,88,0
6483,TAMIL NADU,VILLUPURAM,2009,62,66,8,78,31,33,0
7260,TAMIL NADU,VILLUPURAM,2010,50,78,4,80,43,26,0
8047,TAMIL NADU,VILLUPURAM,2011,86,187,7,274,0,140,0


In [24]:
dataset[dataset.DISTRICT== 'VILUPPURAM']
# notice that above doesn't have year 2005 since for that year the district was mis-spelt.

Unnamed: 0,STATE/UT,DISTRICT,Year,Rape,Kidnapping and Abduction,Dowry Deaths,Assault on women with intent to outrage her modesty,Insult to modesty of Women,Cruelty by Husband or his Relatives,Importation of Girls
3478,TAMIL NADU,VILUPPURAM,2005,49,49,9,50,5,59,0


### CONSISTENCY CORRECTIONS.

In [25]:
# correcting all RLY. cases

# First i looped throught the portion of the dataset that contained RLY.
for c in dataset[dataset.DISTRICT.str.contains('RLY.')].DISTRICT:
    
    # next, i split them into two since my aim was to replace the second part. e.g we had VIJAYAWADA RLY. after the split we 
    # have ['VIJAYAWADA', 'RLY.']. In some cases we didn't have up to 2 values in the split list.
    g= list(c.split(' '))
    
    # look at all the printed values of g for better understanding.
    print(g)
    try:
        # i used the try except because some list won't have any value for index 1 and would return an index error.
        if g[1] == 'RLY.':
            
            # here i replaced all RLY. with RAILWAY.
            dataset.DISTRICT.replace(c, g[0]+' RAILWAY', inplace= True)
    except IndexError:
        pass

['GUNTAKAL', 'RLY.']
['SECUNDERABAD', 'RLY.']
['VIJAYAWADA', 'RLY.']
['JAMALPUR', 'RLY.']
['KATIHAR', 'RLY.']
['MUZAFFARPUR', 'RLY.']
['PATNA', 'RLY.']
['DHANBAD', 'RLY.']
['JAMSHEDPUR', 'RLY.']
['BHOPAL', 'RLY.']
['INDORE', 'RLY.']
['JABALPUR', 'RLY.']
['MUMBAI', 'RLY.']
['NAGPUR', 'RLY.']
['PUNE', 'RLY.']
['CHENNAI', 'RLY.']
['TRICHY', 'RLY.']
['G.R.P.(RLY)']
['GUNTAKAL', 'RLY.']
['SECUNDERABAD', 'RLY.']
['VIJAYAWADA', 'RLY.']
['JAMALPUR', 'RLY.']
['KATIHAR', 'RLY.']
['MUZAFFARPUR', 'RLY.']
['PATNA', 'RLY.']
['DHANBAD', 'RLY.']
['JAMSHEDPUR', 'RLY.']
['BHOPAL', 'RLY.']
['INDORE', 'RLY.']
['JABALPUR', 'RLY.']
['MUMBAI', 'RLY.']
['NAGPUR', 'RLY.']
['PUNE', 'RLY.']
['CHENNAI', 'RLY.']
['TRICHY', 'RLY.']
['G.R.P.(RLY)']
['GUNTAKAL', 'RLY.']
['SECUNDERABAD', 'RLY.']
['VIJAYAWADA', 'RLY.']
['JAMALPUR', 'RLY.']
['KATIHAR', 'RLY.']
['MUZAFFARPUR', 'RLY.']
['PATNA', 'RLY.']
['DHANBAD', 'RLY.']
['JAMSHEDPUR', 'RLY.']
['BHOPAL', 'RLY.']
['INDORE', 'RLY.']
['JABALPUR', 'RLY.']
['MUMBAI', 'RLY.']

In [26]:
# The code above corrects all the cases where RLY. and RAILWAY was written making our dataset consistent.
display(dataset[dataset.DISTRICT.str.contains('RLY')])

# the RLY cases left are quite different and we could correct those manually.
# correcting G.R.P.(RLY)
dataset.DISTRICT.replace('GRP(RLY)', 'G. R. P.(RLY)', inplace= True)

Unnamed: 0,STATE/UT,DISTRICT,Year,Rape,Kidnapping and Abduction,Dowry Deaths,Assault on women with intent to outrage her modesty,Insult to modesty of Women,Cruelty by Husband or his Relatives,Importation of Girls
168,GUJARAT,W.RLY,2001,1,0,0,7,0,7,0
702,DELHI,G.R.P.(RLY),2001,1,7,0,4,0,0,0
884,GUJARAT,W.RLY,2002,3,7,0,4,0,5,0
1420,DELHI,G.R.P.(RLY),2002,0,0,0,3,0,0,0
1607,GUJARAT,W.RLY,2003,2,2,0,9,0,4,0
2149,DELHI,G.R.P.(RLY),2003,0,5,3,4,0,0,0
2335,GUJARAT,W.RLY,2004,3,0,0,9,10,4,0
2878,DELHI,G.R.P.(RLY),2004,5,2,2,2,1,0,0
3068,GUJARAT,W.RLY,2005,2,4,0,8,1,1,0
3611,DELHI,G.R.P.(RLY),2005,2,1,1,5,1,0,0


In [27]:
# Removing the total rows
# We have some rows giving us the total of other rows, we don't want that.
# we drop them.
dataset = dataset[~dataset.DISTRICT.str.contains('TOTAL')]

In [28]:
# Changing column names.
dataset.columns = ['state/ut', 'district', 'year', 'rape',
       'kidnapping_and_abduction', 'dowry_deaths', 'assault_on_women',
       'insult_to_modesty', 'cruelty_by_husband_or_relatives',
       'importation_of_girls']

## Data Exploration and Visualization

In [30]:
dataset.corr()
# what i was thinking was since assault on modesty is like touching a woman in the wrong places and because of the 0.7 
# correlation between rape and assault i was thinking that maybe in some of the assault cases , rape was also committed
# so basically a double offense ??

Unnamed: 0,year,rape,kidnapping_and_abduction,dowry_deaths,assault_on_women,insult_to_modesty,cruelty_by_husband_or_relatives,importation_of_girls
year,1.0,0.169368,0.288149,0.025365,0.15491,-0.018629,0.145202,-0.025272
rape,0.169368,1.0,0.597361,0.354274,0.69992,0.199903,0.589191,0.030191
kidnapping_and_abduction,0.288149,0.597361,1.0,0.477835,0.439433,0.14905,0.588252,0.006666
dowry_deaths,0.025365,0.354274,0.477835,1.0,0.253745,0.178263,0.34191,0.096861
assault_on_women,0.15491,0.69992,0.439433,0.253745,1.0,0.365956,0.497796,-0.002388
insult_to_modesty,-0.018629,0.199903,0.14905,0.178263,0.365956,1.0,0.27157,0.001541
cruelty_by_husband_or_relatives,0.145202,0.589191,0.588252,0.34191,0.497796,0.27157,1.0,0.017236
importation_of_girls,-0.025272,0.030191,0.006666,0.096861,-0.002388,0.001541,0.017236,1.0


In [None]:
plt.scatter(dataset['Assault on women with intent to outrage her modesty'], dataset.Rape)

Question 1: How rape cases were reported each year from 2001 to 2014?

In [None]:
rape_cases = pd.DataFrame(dataset.groupby('Year')['Rape'].sum())
rape_cases

In [None]:
sns.scatterplot(data=rape_cases, x='Year', y='Rape')

Question 2: How does the dowry deaths and kidnapping and abduction vary between each states?

In [None]:
crime_state = pd.DataFrame(dataset.groupby('STATE/UT').sum())
crime_state = crime_state.loc[:, ['Dowry Deaths', 'Kidnapping and Abduction']]
crime_state

In [None]:
sns.set(style='whitegrid')
g = sns.PairGrid(crime_state, x_vars=crime_state.columns[1], y_vars=crime_state.columns[0], height=20)
sns.despine(left=True, bottom=True)
g.map(sns.stripplot, size=20, orient="h", palette="ch:s=1,r=-.1,h=1_r", linewidth=2, edgecolor="w")
plt.show()