# Mongo

## Importing libraries and setting up connection

In [42]:
from pymongo import MongoClient
import pandas as pd
import time
import numpy as np
import re

In [43]:
client = MongoClient("localhost:27017")

In [44]:
client.list_database_names()

['Ironhack', 'admin', 'config', 'local']

In [45]:
db = client["Ironhack"]

In [46]:
c = db.get_collection("companies")

## Queryng

### Three cities are selected as candidates based on the location of the video game companies that have raised the most money in total.

In [47]:
#c.distinct("category_code")

In [59]:
def clean_data(results):
    
    """
    This function queries a mongo db called companies to retrieve information about gaming companies
    that have raised more than 1M (€, $ or ¥) and have offices in cities.
    
    It returns a pandas dataframe with the company name, the total amount of money raised,
    and the city of the office.
    """
    
    #query the companies db to 
    query = {"category_code": "games_video",
             "total_money_raised": {"$regex": re.compile(r"^(€|\$|¥)[1-9]\d*M$")},
             "offices.city": {"$exists": True, "$ne": None}}
    projection = {"name": 1, "_id" : 0, "total_money_raised": 1, "offices.city": 1}
    results = list(c.find(query, projection).sort("total_money_raised", -1))
    
    # convert into a df and clean the data
    df = pd.DataFrame(results)
    df['offices'] = df['offices'].apply(lambda x: x[0]['city'] if x else None)
    df['offices'].replace('', np.nan, inplace=True)
    df.dropna(subset=['offices'], inplace=True)
    pd.set_option('display.max_rows', None)
    
    return df


In [60]:
df["offices"].value_counts().head(3)

offices
San Francisco    15
New York         13
Santa Monica      5
Name: count, dtype: int64

### Nearby companies that also do design.

In [63]:
db.companies.distinct("category_code")

[None,
 'advertising',
 'analytics',
 'automotive',
 'biotech',
 'cleantech',
 'consulting',
 'design',
 'ecommerce',
 'education',
 'enterprise',
 'fashion',
 'finance',
 'games_video',
 'government',
 'hardware',
 'health',
 'hospitality',
 'legal',
 'local',
 'manufacturing',
 'medical',
 'messaging',
 'mobile',
 'music',
 'nanotech',
 'network_hosting',
 'news',
 'nonprofit',
 'other',
 'photo_video',
 'public_relations',
 'real_estate',
 'search',
 'security',
 'semiconductor',
 'social',
 'software',
 'sports',
 'transportation',
 'travel',
 'web']

In [64]:
query = {"category_code": "design", "category_code": "photo_video", "category_code": "fashion"}
projection = {"name": 1, "_id":0, "category_code": 1,  "offices.city": 1}

list(c.find(query, projection).sort("category_code", 1))

[{'name': 'Gilt Groupe',
  'category_code': 'fashion',
  'offices': [{'city': 'New York'}]},
 {'name': 'Nike',
  'category_code': 'fashion',
  'offices': [{'city': 'Beaverton'}]},
 {'name': 'Stylesight',
  'category_code': 'fashion',
  'offices': [{'city': 'NEW YORK'}]},
 {'name': 'Stylesight',
  'category_code': 'fashion',
  'offices': [{'city': 'NEW YORK'}]},
 {'name': 'Refinery29',
  'category_code': 'fashion',
  'offices': [{'city': 'New York'}]},
 {'name': 'Geelbe',
  'category_code': 'fashion',
  'offices': [{'city': 'Bogotá'}]},
 {'name': 'thredUP',
  'category_code': 'fashion',
  'offices': [{'city': 'San Francisco'}]},
 {'name': 'Simon Property Group',
  'category_code': 'fashion',
  'offices': [{'city': 'Indianapolis'}]},
 {'name': 'HauteLook',
  'category_code': 'fashion',
  'offices': [{'city': 'Los Angeles'}]},
 {'name': 'Chicisimo',
  'category_code': 'fashion',
  'offices': [{'city': 'Bilbao'}]}]