In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

np.set_printoptions(precision=2)


fruits = pd.read_table('fruit_data_with_colors.txt')

feature_names_fruits = ['height', 'width', 'mass', 'color_score']
X_fruits = fruits[feature_names_fruits]
y_fruits = fruits['fruit_label']
target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']

X_fruits_2d = fruits[['height', 'width']]
y_fruits_2d = fruits['fruit_label']

X_train, X_test, y_train, y_test = train_test_split(X_fruits, y_fruits, random_state=0)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
# we must apply the scaling to the test set that we computed for the training set
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train_scaled, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train_scaled, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test_scaled, y_test)))

example_fruit = [[5.5, 2.2, 10, 0.70]]
example_fruit_scaled = scaler.transform(example_fruit)
print('Predicted fruit type for ', example_fruit, ' is ', 
          target_names_fruits[knn.predict(example_fruit_scaled)[0]-1])

Accuracy of K-NN classifier on training set: 0.95
Accuracy of K-NN classifier on test set: 1.00
Predicted fruit type for  [[5.5, 2.2, 10, 0.7]]  is  mandarin


In [2]:
from sklearn.datasets import make_classification, make_blobs
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_breast_cancer
from adspy_shared_utilities import load_crime_dataset

cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF','#000000'])


# synthetic dataset for simple regression
from sklearn.datasets import make_regression
plt.figure()
plt.title('Sample regression problem with one input variable')
X_R1, y_R1 = make_regression(n_samples = 100, n_features=1,
                            n_informative=1, bias = 150.0,
                            noise = 30, random_state=0)
plt.scatter(X_R1, y_R1, marker= 'o', s=50)
plt.show()


# synthetic dataset for more complex regression
from sklearn.datasets import make_friedman1
plt.figure()
plt.title('Complex regression problem with one input variable')
X_F1, y_F1 = make_friedman1(n_samples = 100,
                           n_features = 7, random_state=0)

plt.scatter(X_F1[:, 2], y_F1, marker= 'o', s=50)
plt.show()

# synthetic dataset for classification (binary) 
plt.figure()
plt.title('Sample binary classification problem with two informative features')
X_C2, y_C2 = make_classification(n_samples = 100, n_features=2,
                                n_redundant=0, n_informative=2,
                                n_clusters_per_class=1, flip_y = 0.1,
                                class_sep = 0.5, random_state=0)
plt.scatter(X_C2[:, 0], X_C2[:, 1], c=y_C2,
           marker= 'o', s=50, cmap=cmap_bold)
plt.show()


# more difficult synthetic dataset for classification (binary) 
# with classes that are not linearly separable
X_D2, y_D2 = make_blobs(n_samples = 100, n_features = 2, centers = 8,
                       cluster_std = 1.3, random_state = 4)
y_D2 = y_D2 % 2
plt.figure()
plt.title('Sample binary classification problem with non-linearly separable classes')
plt.scatter(X_D2[:,0], X_D2[:,1], c=y_D2,
           marker= 'o', s=50, cmap=cmap_bold)
plt.show()


# Breast cancer dataset for classification
cancer = load_breast_cancer()
(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)


# Communities and Crime dataset
(X_crime, y_crime) = load_crime_dataset()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [49]:
import requests
from bs4 import BeautifulSoup

# url of the website
doc = "https://innospot.de/en"

# getting response object
res = requests.get(doc)

# Initialize the object with the document
soup = BeautifulSoup(res.content, "html.parser")

# Get the whole body tag
tag = soup.body
a = []
# Print each string recursively
for string in tag.strings:
    if string != '\n' and string != ' ' :
        a.append(string)
a = ' '.join(a)
a   

'403 Forbidden nginx'

In [20]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from collections import Counter
import nltk
text_tokens = word_tokenize(a.lower())
text_tokens

['en',
 'de',
 'fr',
 'what',
 'we',
 'offer',
 'solution',
 'case',
 'studies',
 'about',
 'company',
 'team',
 'culture',
 'konux',
 'labs',
 'contact',
 'career',
 'open',
 'positions',
 'application',
 'process',
 'resources',
 'blog',
 'media',
 'kit',
 'horizon',
 '2020',
 'request',
 'a',
 'demo',
 'transform',
 'railway',
 'operations',
 'for',
 'a',
 'sustainable',
 'future',
 'konux',
 'combines',
 'machine',
 'learning',
 'algorithms',
 'and',
 'iot',
 'to',
 'deliver',
 'software-as-a-service',
 'solutions',
 'for',
 'operation',
 ',',
 'monitoring',
 ',',
 'and',
 'maintenance',
 'process',
 'automation',
 'watch',
 'video',
 'calculate',
 'value',
 'read',
 'our',
 'case',
 'studies',
 '``',
 'smart',
 'technology',
 'makes',
 'switches',
 'more',
 'intelligent',
 'and',
 'rail',
 'transportation',
 'noticeably',
 'more',
 'reliable',
 "''",
 'ronald',
 'pofalla',
 ',',
 'member',
 'of',
 'the',
 'management',
 'board',
 'for',
 'infrastructure',
 'at',
 'deutsche',
 'bah

In [21]:
tokens_without_punc = [w for w in text_tokens if w.isalpha()]
print(tokens_without_punc)

['en', 'de', 'fr', 'what', 'we', 'offer', 'solution', 'case', 'studies', 'about', 'company', 'team', 'culture', 'konux', 'labs', 'contact', 'career', 'open', 'positions', 'application', 'process', 'resources', 'blog', 'media', 'kit', 'horizon', 'request', 'a', 'demo', 'transform', 'railway', 'operations', 'for', 'a', 'sustainable', 'future', 'konux', 'combines', 'machine', 'learning', 'algorithms', 'and', 'iot', 'to', 'deliver', 'solutions', 'for', 'operation', 'monitoring', 'and', 'maintenance', 'process', 'automation', 'watch', 'video', 'calculate', 'value', 'read', 'our', 'case', 'studies', 'smart', 'technology', 'makes', 'switches', 'more', 'intelligent', 'and', 'rail', 'transportation', 'noticeably', 'more', 'reliable', 'ronald', 'pofalla', 'member', 'of', 'the', 'management', 'board', 'for', 'infrastructure', 'at', 'deutsche', 'bahn', 'learn', 'about', 'the', 'collaboration', 'between', 'db', 'and', 'konux', 'the', 'konux', 'system', 'is', 'an', 'solution', 'which', 'uses', 'iiot

In [23]:
stop_words = stopwords.words('english')
tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words ]

In [24]:
print(tokens_without_sw)

['en', 'de', 'fr', 'offer', 'solution', 'case', 'studies', 'company', 'team', 'culture', 'konux', 'labs', 'contact', 'career', 'open', 'positions', 'application', 'process', 'resources', 'blog', 'media', 'kit', 'horizon', 'request', 'demo', 'transform', 'railway', 'operations', 'sustainable', 'future', 'konux', 'combines', 'machine', 'learning', 'algorithms', 'iot', 'deliver', 'solutions', 'operation', 'monitoring', 'maintenance', 'process', 'automation', 'watch', 'video', 'calculate', 'value', 'read', 'case', 'studies', 'smart', 'technology', 'makes', 'switches', 'intelligent', 'rail', 'transportation', 'noticeably', 'reliable', 'ronald', 'pofalla', 'member', 'management', 'board', 'infrastructure', 'deutsche', 'bahn', 'learn', 'collaboration', 'db', 'konux', 'konux', 'system', 'solution', 'uses', 'iiot', 'devices', 'artificial', 'intelligence', 'improve', 'network', 'availability', 'extend', 'asset', 'lifetime', 'reduce', 'costs', 'continuously', 'monitors', 'analyzes', 'health', 'ke

In [29]:
from collections import Counter
Counter(tokens_without_sw).most_common(50)[-10:]

[('fr', 1),
 ('culture', 1),
 ('labs', 1),
 ('contact', 1),
 ('open', 1),
 ('positions', 1),
 ('application', 1),
 ('resources', 1),
 ('blog', 1),
 ('horizon', 1)]

In [47]:
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from collections import Counter
import nltk
from collections import Counter

def main():
    print("This is the official playground for this programming exercise")

    urls = [
        "https://innospot.de/en",
        "https://www.konux.com/",
        "https://techcrunch.com/",
        "https://www.telekom.com/en",
        "https://www.commerzbank.de/portal/en/englisch/english.html"
    ]
    box = []
    for doc in urls : 
        # getting response object
        res = requests.get(doc)

        # Initialize the object with the document
        soup = BeautifulSoup(res.content, "html.parser")

        # Get the whole body tag
        tag = soup.body
        
        # Print each string recursively
        for string in tag.strings:
            if string != '\n' and string != ' ' :
                box.append(string)
    box = ' '.join(box)    
    text_tokens = word_tokenize(box.lower())
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]
    stop_words = stopwords.words('english')
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words ]

    count_max  = Counter(tokens_without_sw).most_common(50)[:10]
    print("10 most used keywords :", count_max)
    print("********************")
    count_min  = Counter(tokens_without_sw).most_common(50)[-10:][::-1]
    print("10 fewest used keywords :", count_min)
if __name__ == "__main__":
    main()

This is the official playground for this programming exercise
10 most used keywords : [('telekom', 53), ('deutsche', 34), ('hours', 31), ('ag', 23), ('media', 21), ('oct', 20), ('special', 16), ('data', 15), ('der', 15), ('new', 14)]
********************
10 fewest used keywords : [('public', 6), ('services', 6), ('privacy', 6), ('work', 6), ('us', 6), ('learn', 6), ('open', 6), ('name', 7), ('bei', 7), ('online', 7)]


NameError: name 'main' is not defined