## Structured versus unstructured data

### Example of data preprocessing

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
sentence = "This Wednesday morn, are you early to rise? Then look East. The Crescent Moon joins Venus & Saturn. Afloat in the dawn skies"

In [6]:
vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words="english")

In [8]:
splited_sentence = sentence.split(" ")
X = vectorizer.fit_transform(splited_sentence)
# print(vectorizer.get_feature_names())
freqs = [
    (word, X.getcol(idx).sum()) for word, idx in vectorizer.vocabulary_.items()
]

for phrase, times in sorted(freqs, key = lambda x: -x[1])[:25]:
    print(phrase, times)
    
print(len(sentence))

(u'east', 1)
(u'afloat', 1)
(u'look', 1)
(u'rise', 1)
(u'wednesday', 1)
(u'moon', 1)
(u'early', 1)
(u'morn', 1)
(u'crescent', 1)
(u'joins', 1)
(u'venus', 1)
(u'saturn', 1)
(u'dawn', 1)
(u'skies', 1)
124


In [9]:
# relative length
print(len(sentence)/30.0)

4.13333333333


### Example - world alcohol comsumption data

In [10]:
import pandas as pd

In [11]:
# CM:: read csv form url
drinks = pd.read_csv('https://raw.githubusercontent.com/sinanuozdemir/principles_of_data_science/master/data/chapter_2/drinks.csv')

In [12]:
# CM:: explore the data's first five rows
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF


In [13]:
# CM:: use pandas to get basic summary statistics about the column `continent`
drinks["continent"].describe()

count     170
unique      5
top        AF
freq       53
Name: continent, dtype: object

In [14]:
# CM:: describing a quantitative colum
drinks["beer_servings"].describe()

count    193.000000
mean     106.160622
std      101.143103
min        0.000000
25%       20.000000
50%       76.000000
75%      188.000000
max      376.000000
Name: beer_servings, dtype: float64

## The ordinal level

### Measures of center

In [4]:
import numpy

results = [5, 4, 3, 4, 5, 3, 2, 5, 3, 2, 1, 4, 5, 3, 4, 4, 5, 4, 2, 1, 4, 3, 5, 4, 3, 2, 4, 4, 5, 4, 3, 2, 1]

In [5]:
sorted_results = sorted(results)
print(sorted_results)

[1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5]


In [6]:
print(numpy.mean(results))

3.4242424242424243


In [7]:
print(numpy.median(results))

4.0


## The interval level

### Measures of center

In [8]:
import numpy

temps = [31, 32, 32, 31, 28, 29, 31, 38, 32, 31, 30, 29, 30, 32, 26]

In [9]:
print(numpy.mean(temps))

30.8


In [10]:
print(numpy.median(temps))

31.0


### Measures of variation

#### Standard deviation

In [12]:
mean = numpy.mean(temps)
print(mean)

30.8


In [13]:
squared_differences = []

In [14]:
for temperature in temps:
    difference = temperature - mean
    # CM:: how far is the point from the mean
    
    squared_difference = difference**2
    # CM:: square the difference
    
    squared_differences.append(squared_difference)
    # CM:: add to the list

average_squared_difference = numpy.mean(squared_differences)
# CM:: this is the "variance"

standar_deviation = numpy.sqrt(average_squared_difference)
# CM:: holds the result

print(standar_deviation)

2.535087112244206


## The ratio level

### Measures of center

In [15]:
num_items = len(temps)
product = 1.

for temperature in temps:
    product *= temperature

geometric_mean = product**(1./num_items)

print(geometric_mean)

30.6996443763
