### Processing

In [None]:
# Parse the raw script dump into one row per spoken line.
# Line layout relied on by the code below:
#   ">> <episode header>"   starts a new episode
#   "> <scene>"             starts a new scene
#   "<character>: <text>"   a spoken line
season = ""
episode = ""
scene = ""
data = []
with open("data/all_scripts.txt") as f:
    # Iterate over the file lazily instead of materialising it with readlines().
    for raw_line in f:
        # Strip only the line terminator: slicing off the last character would
        # eat real text when the final line has no trailing newline.
        line = raw_line.rstrip("\n")
        if not line:
            # Blank lines carry no dialogue and cannot be split below.
            continue
        if line.startswith(">> "):
            # assumes the season number occupies characters 10-11 of the header — TODO confirm
            season = int(line[10:12])
            episode = line[3:]
            continue
        if line.startswith("> "):
            scene = line[2:]
            continue
        # maxsplit=1 keeps any later ": " inside the spoken text.
        # A non-blank line without ": " still raises ValueError, as before.
        character, line = line.split(": ", 1)
        data.append([season, episode, scene, character, line])
lines = pd.DataFrame(data, columns=["Season", "Episode", "Scene", "Character", "Line"])

### Matplotlib

In [None]:
# Scatter 'ratings_treat' (x) against 'ratings_not' (y) from prop_20_matched.
fig, axs = plt.subplots(1, figsize=(7, 7), sharex=True, sharey=True)
axs.scatter(prop_20_matched['ratings_treat'], prop_20_matched['ratings_not'])
axs.set(
    xlabel='Treatment Ratings',
    ylabel='Control Ratings',
    title='Ratings Scatterplot',
)
plt.show()

In [None]:
# Three density histograms (13 bins each) drawn side by side on shared axes.
fig, axs = plt.subplots(1, 3, figsize=(10, 3), sharex=True, sharey=True)

# One (ratings, x-axis label) pair per panel, left to right.
panels = [
    (prop_20_matched['ratings_treat'], 'Treatment'),
    (prop_20_matched['ratings_not'], 'Control - Matched'),
    (prop_20_not['ratings'], 'Control - All'),
]
for ax, (ratings, label) in zip(axs, panels):
    sns.histplot(x=ratings, stat="density", bins=13, ax=ax)
    ax.set_xlabel(label)

fig.suptitle('Ratings Histogram')

plt.show()

In [None]:
# Candidate values for the regularization parameter C.
grid = [0.01, 0.1, 1, 10, 100, 1000, 10000]

# Mean 10-fold cross-validation score per candidate, in the same order as `grid`.
accs = []
for c in grid:
    # A fresh classifier per candidate; only C varies between iterations.
    clf = LogisticRegression(C=c, solver='lbfgs', random_state=0)
    scores = cross_val_score(clf, X_train, Y_train, cv=10)
    accs.append(scores.mean())

In [None]:
# Empirical distribution of 'distance_from_home' on 49 log-spaced bins
# spanning 10^0 .. 10^5 (values outside that range are ignored by np.histogram).
w_hist, w_bins = np.histogram(
    df_distribution_bright['distance_from_home'],
    bins=np.logspace(0, 5, 50),
    density=True,
)

# Geometric midpoints are the centres of log-spaced bins; the left edges used
# previously shift every point towards smaller distances.
w_bin_centers = np.sqrt(w_bins[:-1] * w_bins[1:])

# With density=True each value is a density, so the probability mass of a bin
# is density * width. Summing raw densities is not a CDF when widths differ.
w_cdf = np.cumsum(w_hist * np.diff(w_bins))

fig, axs = plt.subplots(1, 3, figsize=(11, 4))
axs[0].plot(w_bin_centers, w_hist)
# The cumulative mass up to and including a bin is reached at its right edge.
axs[1].plot(w_bins[1:], w_cdf)
axs[2].plot(w_bin_centers, w_hist)

# PDF: log x only; CDF and log-log PDF: both axes logarithmic.
axs[0].set_xscale("log")
axs[1].set_yscale("log")
axs[1].set_xscale("log")
axs[2].set_yscale("log")
axs[2].set_xscale("log")

axs[0].set_title('PDF')
axs[1].set_title('CDF')
axs[2].set_title('Log Log PDF')

plt.tight_layout()

### Groupby and Apply Functions

In [None]:
# Bucket the rows per user and per calendar day.
# pd.Grouper(freq=...) needs a datetime-like 'local_time_grouper' column.
daily_per_user = df_bright.groupby(
    ['user', pd.Grouper(key="local_time_grouper", freq="1D")]
)

# Mean distance from home per (user, day).
# Not the last expression of the cell, so the notebook does not display it.
daily_per_user.agg({'distance_from_home': 'mean'})

# Same daily means plus the max 'season' of each bucket, regrouped by season.
# The result is a SeriesGroupBy: chain an aggregation (e.g. .mean()) to get numbers.
daily_per_user.agg(
    {'season': 'max', 'distance_from_home': 'mean'}
).groupby('season')['distance_from_home']

In [None]:
# Parse 'local_time_grouper' with pd.to_datetime and write the result back to
# the same column (df_bright is modified in place).
df_bright['local_time_grouper'] = pd.to_datetime(df_bright['local_time_grouper'])

In [None]:
# Count the space-separated tokens of every line. Splitting on a single ' '
# means consecutive spaces produce empty tokens, which are counted too.
lines["Words"] = lines["Line"].map(lambda text: len(text.split(' ')))

In [None]:
# One Series of space-separated tokens per line, stacked into a single frame.
# reset_index() turns each token's position within its line into the 'index' column.
corpus_frequency = pd.concat(
    [pd.Series(text.split(' ')) for text in lines['Line']]
).reset_index()

In [None]:
# Swap enumerate's (position, file id) pairs so that every file id maps to its
# position in our_books.fileids().
# NOTE(review): the earlier note said this "needs to be set of both otherwise
# won't work" — meaning unclear; presumably the ids must be unique, since a
# repeated id would overwrite its earlier entry.
{fileid: position for position, fileid in enumerate(our_books.fileids())}

### Stats Model

In [None]:
# Linear regression of `time` on two predictors wrapped in C(...), i.e. treated as categorical.
ols_formula = 'time ~ C(diabetes) + C(high_blood_pressure)'
mod = smf.ols(formula=ols_formula, data=df)

# Seed NumPy's global RNG before fitting (kept for consistency between runs).
# NOTE(review): an OLS fit is presumably deterministic, so the seed may be unnecessary — confirm.
np.random.seed(2)
res = mod.fit()

res.summary()

In [None]:
# logit is logistic regression; formula= and data= work as for smf.ols.
# The formula is assembled from adjacent string literals so that no indentation
# whitespace ends up inside it, and each predictor is listed exactly once
# (C(high_blood_pressure) used to appear twice).
logit_formula = (
    'DEATH_EVENT ~ age + creatinine_phosphokinase + ejection_fraction + '
    'platelets + serum_creatinine + serum_sodium + '
    'C(diabetes) + C(high_blood_pressure) + '
    'C(sex) + C(anaemia) + C(smoking)'
)
mod = smf.logit(formula=logit_formula, data=df)
res = mod.fit()
print(res.summary())

In [None]:
# t-test for two independent samples: accept_2020 versus reject_2020.
# NOTE(review): called with SciPy's defaults — presumably equal variances and a
# two-sided alternative; confirm that is what the analysis needs.
scipy.stats.ttest_ind(accept_2020, reject_2020)

### Sklearn

In [None]:
 # Split X and y into train and test parts; test_size and random_state are
 # taken from variables that must already exist when this cell runs.
 # NOTE(review): this line starts with a stray space. Notebooks presumably
 # tolerate it, but a plain Python script would raise IndentationError — confirm
 # and remove. It also looks like the body of the split_data_randomly helper — verify.
 X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,test_size = test_size, random_state = random_state)

In [None]:
# type of regressor
gb = ensemble.GradientBoostingRegressor()

# hyper-parameter grid explored by the search (6 x 3 = 18 combinations)
parameters = {'n_estimators': [50, 75, 100, 150, 200, 250],
              'learning_rate': [.1, .05, .01]}

# target and features; the features are exactly the NEW_FEATURES columns
# (the former `X = df.drop('ratings', axis=1)` was overwritten immediately
# and never used, so it has been removed)
y = df['ratings']
X = df[NEW_FEATURES]

# splitting the train and test dataset (30% held out, fixed seed)
X_train, X_test, y_train, y_test = split_data_randomly(X, y, test_size=.3, random_state=1)

# 20-fold cross-validated grid search, scored by R^2
clf = model_selection.GridSearchCV(gb, parameters, cv=20, scoring='r2')
clf.fit(X_train, y_train)
best_param = clf.best_params_

In [None]:
# Build one [lower, upper] pair per row of `result`: sort the row's values and
# keep the entries at positions 1 and 18.
# assumes each row holds 20 values, so this drops the single smallest and
# largest value (roughly a 90% interval) — TODO confirm
# Rows are looked up by the labels 0 .. n_rows-1 via .loc.
ordered_rows = (sorted(result.loc[i, :]) for i in range(result.shape[0]))
confidence_interval = [[ordered[1], ordered[18]] for ordered in ordered_rows]

In [None]:
# Re-create the regressor with the parameters stored in best_param, then fit it
# on the training split (fit() returns the fitted estimator).
optimal_reg = ensemble.GradientBoostingRegressor(**best_param)
optimal_reg = optimal_reg.fit(X_train, y_train)

In [None]:
# Predict on the held-out features and score the predictions with R^2.
pred = optimal_reg.predict(X_test)
r2 = metrics.r2_score(y_true=y_test, y_pred=pred)

### Text

In [None]:
# Keep only the rows whose 'Character' is one of recurrent_characters.
is_recurrent = lines["Character"].isin(recurrent_characters)
lines[is_recurrent]

### Networks

In [None]:
# Helper function for plotting the degree distribution of a Graph
def plot_degree_distribution(G):
    """Draw a bar chart showing how many nodes of G have each degree.

    G only needs to provide ``nodes()`` and ``degree(node)``. A new figure is
    created through ``plt.subplots()``; nothing is returned.
    """
    # Tally the nodes per degree value.
    tally = {}
    for node in G.nodes():
        node_degree = G.degree(node)
        tally[node_degree] = tally.get(node_degree, 0) + 1

    # Degrees in ascending order with their matching counts.
    deg = sorted(tally)
    cnt = [tally[d] for d in deg]

    fig, ax = plt.subplots()
    plt.bar(deg, cnt, width=0.80, color='b')
    plt.title("Degree Distribution")
    plt.ylabel("Frequency")
    plt.xlabel("Degree")
    # Ticks are nudged right by 0.05 but labelled with the exact degree.
    ax.set_xticks([d + 0.05 for d in deg])
    ax.set_xticklabels(deg)
    
# Helper function for printing various graph properties
def describe_graph(G):
    """Print a short structural summary of the graph G.

    Printed, in order: a header with the graph type and its node and edge
    counts; the average shortest path length and the diameter when
    ``nx.is_connected(G)`` is true (otherwise a note that they are undefined);
    the density; and the transitivity. Nothing is returned.
    """
    # nx.info() was removed in NetworkX 3.0, so build the header directly from
    # the graph instead of relying on it.
    print("%s with %d nodes and %d edges"
          % (type(G).__name__, G.number_of_nodes(), G.number_of_edges()))
    if nx.is_connected(G):
        print("Avg. Shortest Path Length: %.4f" %nx.average_shortest_path_length(G))
        print("Diameter: %.4f" %nx.diameter(G)) # Longest shortest path
    else:
        print("Graph is not connected")
        print("Diameter and Avg shortest path length are not defined!")
    # The value printed under "Sparsity" is nx.density: #edges/#edges-complete-graph
    print("Sparsity: %.4f" %nx.density(G))
    # #closed-triplets(3*#triangles)/#all-triplets
    print("Global clustering coefficient aka Transitivity: %.4f" %nx.transitivity(G))
    
# Helper function for visualizing the graph
def visualize_graph(G, with_labels=True, k=None, alpha=1.0, node_shape='o'):
    """Draw G with a spring layout on the current matplotlib axes.

    with_labels -- when true, label every node with the node itself
    k           -- forwarded to nx.spring_layout
    alpha       -- forwarded to the edge drawing call
    node_shape  -- forwarded to the node drawing call (nodes are drawn in 'g')

    The axes frame is switched off at the end; nothing is returned.
    """
    layout = nx.spring_layout(G, k=k)
    if with_labels:
        nx.draw_networkx_labels(G, layout, labels={node: node for node in G.nodes()})
    nx.draw_networkx_edges(G, layout, alpha=alpha)
    nx.draw_networkx_nodes(G, layout, nodelist=G.nodes(), node_color='g', node_shape=node_shape)
    plt.axis('off')

In [None]:
# Build an undirected graph from the 'Source' and 'Target' columns of the
# `edges` dataframe (no edge attributes are copied), then print its summary.
quakerG = nx.from_pandas_edgelist(
    edges,
    source='Source',
    target='Target',
    edge_attr=None,
    create_using=nx.Graph(),
)
describe_graph(quakerG)

In [3]:
#IMPORTANCE
#* **Degree** (generalized by **Katz**)
#* **Betweenness centrality**

# Degree of every node, ranked from highest to lowest.
degrees = dict(quakerG.degree(quakerG.nodes()))
sorted_degree = sorted(degrees.items(), key=itemgetter(1), reverse=True)

# And the top 5 most popular quakers are..
# NOTE(review): each loop below reads a 'Role' node attribute, which must have
# been set on quakerG beforehand — otherwise this raises KeyError; confirm.
for quaker, degree in sorted_degree[:5]:
    print(quaker, 'who is', quakerG.nodes[quaker]['Role'], 'knows', degree, 'people')

# Katz centrality, stored as the node attribute 'katz' and ranked.
# (`degrees` is already computed above and is not recomputed here.)
katz = nx.katz_centrality(quakerG)
nx.set_node_attributes(quakerG, katz, 'katz')
sorted_katz = sorted(katz.items(), key=itemgetter(1), reverse=True)

# Top 5 quakers by Katz centrality
for quaker, katzc in sorted_katz[:5]:
    print(quaker, 'who is', quakerG.nodes[quaker]['Role'], 'has katz-centrality: %.3f' %katzc)

# Compute betweenness centrality
betweenness = nx.betweenness_centrality(quakerG)
# Assign the computed centrality values as a node-attribute in your network
nx.set_node_attributes(quakerG, betweenness, 'betweenness')
sorted_betweenness = sorted(betweenness.items(), key=itemgetter(1), reverse=True)

# Top 5 quakers by betweenness centrality
for quaker, bw in sorted_betweenness[:5]:
    print(quaker, 'who is', quakerG.nodes[quaker]['Role'], 'has betweeness: %.3f' %bw)

In [None]:
# Add one edge per key of common_scenes: the key's first two elements are the
# endpoints and the mapped value is stored as the edge's 'weight'.
for pair, shared in common_scenes.items():
    familiarity_graph.add_edge(pair[0], pair[1], weight=shared)

### Spark

In [None]:
# Expose the DataFrame to Spark SQL under the table name used in the query.
# createOrReplaceTempView replaces the deprecated registerTempTable and makes
# the cell safe to re-run, since an existing view of that name is replaced.
Bombing_Operations.createOrReplaceTempView("Bombing_Operations")

# Number of missions per 'ContryFlyingMission' value, most frequent first.
# The column name is spelled exactly as it is referenced in this notebook.
query = """
SELECT ContryFlyingMission, count(*) as MissionsCount
FROM Bombing_Operations
GROUP BY ContryFlyingMission
ORDER BY MissionsCount DESC
"""

missions_counts = spark.sql(query)
missions_counts.show()

In [None]:
# Same aggregation through the DataFrame API: count the rows per
# 'ContryFlyingMission' value and sort by that count, largest first.
missions_counts = (
    Bombing_Operations
    .groupBy("ContryFlyingMission")
    .agg(count("*").alias("MissionsCount"))
    .sort(desc("MissionsCount"))
)
missions_counts.show()