# StarCubed: Industry analysis in different countries

""" <br>
author: Arundhishaan <br>
Date: 6/27/2020 <br> 

"""

## Identifying all Industry Types

### Load necessary libraries and read the .csv file containing specific country data

In [1]:
# Loading necessary libraries
import numpy  as np             # Perform mathematical calculations         
import pandas as pd             # Work with DataFrames  
import json                     # Load JSON files
from nltk import flatten        # Convert Nested lists to single list
import copy                     # Creating copies of data 

# Load the country data (a Crunchbase download) into a DataFrame
df = pd.read_csv(filepath_or_buffer="canada.csv")

# Preview the first five rows
df.head(n=5)

Unnamed: 0,Organization Name,Organization Name URL,Headquarters Location,Description,CB Rank (Organization),Company Type,Website,LinkedIn,Contact Email,Phone Number,...,Last Funding Amount Currency,Last Funding Amount Currency (in USD),Last Funding Type,Total Funding Amount,Total Funding Amount Currency,Total Funding Amount Currency (in USD),Number of Employees,Founders,Industry Groups,Industries
0,Clearbanc,https://www.crunchbase.com/organization/clearbanc,"Toronto, Ontario, Canada",Clearbanc provides growth capital for web-enab...,39,,https://clearbanc.com/,https://www.linkedin.com/company/clearbanc/,info@clearbanc.com,+1 (415) 952 9864,...,CAD,49257014,Series B,119257014,USD,119257014,51-100,"Andrew D'Souza, Ben Sanders, Charlie Feng, Mic...","Commerce and Shopping, Financial Services, Sof...","E-Commerce, Finance, Financial Services, FinTe..."
1,Ada,https://www.crunchbase.com/organization/ada-su...,"Toronto, Ontario, Canada",Ada is a computer software company that featur...,184,For Profit,https://ada.support,https://www.linkedin.com/company/ada-support/,hello@ada.support,1-877-242-8232.,...,USD,44000000,Series B,60620619,USD,60620619,101-250,"David Hariri, Mike Murchison","Artificial Intelligence, Data and Analytics, I...","Artificial Intelligence, Information Technolog..."
2,Repare Therapeutics,https://www.crunchbase.com/organization/repare...,"Saint Laurent, Quebec, Canada",Repare Therapeutics is a developer of oncology...,209,For Profit,http://www.reparerx.com/,https://in.linkedin.com/company/repare-therape...,info@reparerx.com,,...,USD,82500000,Series B,150500000,USD,150500000,Nov-50,"Agnel Sfeir, Daniel Durocher, Frank Sicheri","Biotechnology, Health Care, Science and Engine...","Biotechnology, Health Care, Pharmaceutical, Th..."
3,Cyclica,https://www.crunchbase.com/organization/cyclica,"Toronto, Ontario, Canada",Cyclica is a globally recognized biotechnology...,247,For Profit,http://www.cyclicarx.com,http://www.linkedin.com/company/2620666,inquiries@cyclicarx.com,+1 (416) 304-9201,...,CAD,16857016,Series B,23809848,USD,23809848,Nov-50,"Jason Mitakidis, Naheed Kurji","Artificial Intelligence, Biotechnology, Data a...","Artificial Intelligence, Biotechnology, Medica..."
4,Unbounce,https://www.crunchbase.com/organization/unbounce,"Vancouver, British Columbia, Canada",Unbounce is a drag-and-drop builder that allow...,341,For Profit,http://www.unbounce.com,http://www.linkedin.com/company/503432?,info@unbounce.com,604 484 1354,...,CAD,38297612,Series A,39147612,USD,39147612,101-250,"Carl Schmidt, Carter Gilchrist, Jason Murphy, ...","Data and Analytics, Information Technology, In...","Big Data, Cloud Computing, Enterprise Software..."


### Check for the number of missing values

In [2]:
# Count the rows whose "Industries" entry is missing
df["Industries"].isnull().sum()

4

### Total number of Startups in our database

In [3]:
# Number of rows in the DataFrame
df.shape[0]

547

### Convert the pandas DataFrame into a JSON file and save it in the directory

In [4]:
# Serialise the DataFrame to JSON keyed by row label (orient='index').
# The file is written to the working directory, i.e. the same relative path
# that open('canada.json') reads back, instead of a machine-specific
# absolute path that only exists on the original author's computer.
df.to_json('canada.json', orient='index')

### Read the JSON file and, for each object in it, convert "Industries" from a comma-separated string into a list

In [5]:
# Load the JSON file back in as a dictionary of records
with open('canada.json') as json_file:
    data = json.load(json_file)


# Split each record's "Industries" string on commas to get a list;
# records whose "Industries" value is None are left untouched
for record in data.values():
    industry_text = record["Industries"]
    if industry_text is not None:
        record["Industries"] = industry_text.split(',')

### Formatting the industry list

In [6]:
# Normalise every industry name: trim surrounding whitespace (the comma
# split leaves a leading space on all but the first name) and capitalise
# the first letter while lower-casing the rest.
# strip() is used instead of indexing value[0] so that an empty element
# (e.g. produced by a doubled or trailing comma) no longer raises IndexError.
for i in data:
    industries = data[i]["Industries"]
    if industries is None:
        # Startups without industry information are left as they are
        continue
    data[i]["Industries"] = [name.strip().capitalize() for name in industries]

### Get all the industry types in one list and remove duplicates

In [7]:
# Gather the per-startup industry lists, skipping startups without any
lst = [data[i]["Industries"] for i in data if data[i]["Industries"] is not None]


# Collapse the list of lists into a single flat list (nltk's flatten)
industries_lst = flatten(lst)


# Drop repeated industry names
unique_lst = list(set(industries_lst))


# Sort in alphabetical order
ordered_lst = sorted(unique_lst)


# Print the industry types as a numbered list, counting from 1
for position, name in enumerate(ordered_lst, start=1):
    print(position, name)

1 3d printing
2 3d technology
3 Accounting
4 Ad exchange
5 Ad network
6 Ad targeting
7 Advanced materials
8 Advertising
9 Advertising platforms
10 Aerospace
11 Agriculture
12 Agtech
13 Alternative medicine
14 Analytics
15 Android
16 Angel investment
17 Animal feed
18 Animation
19 App marketing
20 Application performance management
21 Apps
22 Aquaculture
23 Architecture
24 Art
25 Artificial intelligence
26 Assistive technology
27 Augmented reality
28 Automotive
29 Autonomous vehicles
30 B2b
31 Baby
32 Banking
33 Battery
34 Beauty
35 Big data
36 Bioinformatics
37 Biometrics
38 Biopharma
39 Biotechnology
40 Bitcoin
41 Blockchain
42 Blogging platforms
43 Brand marketing
44 Brewing
45 Broadcasting
46 Building maintenance
47 Building material
48 Business development
49 Business intelligence
50 Cad
51 Cannabis
52 Car sharing
53 Catering
54 Chemical
55 Clean energy
56 Cleantech
57 Clinical trials
58 Cloud computing
59 Cloud data services
60 Cloud infrastructure
61 Cloud management
62 Cloud sec

### Finding the total number of industry types

In [8]:
# Total number of industry types, i.e. the length of the sorted,
# de-duplicated list of industry names
len(ordered_lst)

395

## Removing all Blacklist Industries

### Create a list of industry blacklist

In [9]:
# Industry names that disqualify a startup from the analysis
black_lst = {"Bitcoin", "Blockchain", "Cryptocurrency"}

### Remove all the startups that belong to the blacklisted industries     

In [10]:
# Work on an independent deep copy so that `data` itself stays unmodified
json_copy = copy.deepcopy(data)


# Drop every startup that lists at least one blacklisted industry.
# The keys are snapshotted first so entries can be deleted during the loop;
# startups whose "Industries" value is None are kept.
for key in tuple(json_copy):
    tags = json_copy[key]["Industries"]
    if tags is None:
        continue
    if not black_lst.isdisjoint(tags):
        del json_copy[key]

### Convert the JSON data back to a pandas DataFrame

In [11]:
# Rebuild a DataFrame from the filtered dictionary, one row per outer key
new_df = pd.DataFrame.from_dict(json_copy, orient='index')

# Move the old row labels into an "index" column, then discard that column
# so the rows are simply numbered from 0
new_df = new_df.reset_index()
new_df = new_df.drop(columns=['index'])

### Total Startups after Blacklist

In [12]:
# Number of startups left once the blacklisted industries are removed
new_df.shape[0]

529

## Preparing DataFrame for Dashboarding

### Duplicate rows by exploding the Industries list

In [14]:
# Expand the "Industries" lists so that each list element gets its own row,
# with the values of the other columns repeated on every such row
new_df = new_df.explode(column="Industries")


# Write the result to a .csv file in the working directory, without the index
new_df.to_csv(path_or_buf="db_canada.csv", index=False)